\documentclass[prd,amsmath,aps,floats,amssymb, floatfix,
superscriptaddress,nofootinbib]{revtex4-1}
%% removed linenumbers
\usepackage{amsmath,amssymb,natbib,latexsym,times}
%\usepackage{ulem} % remove before resubmission
\DeclareMathOperator\arctanh{arctanh}
\usepackage[switch,columnwise]{lineno}
\renewcommand{\topfraction}{0.99}
% To solve problem with mnras not understanding Type 3 fonts
\usepackage[T1]{fontenc}
\usepackage{aecompl}
\newcommand{\mr}{\mathrm}
%%%% Scott's macros
\newcommand{\sfig}[2]{
\includegraphics[width=#2]{#1}
}
\newcommand{\Sfig}[2]{
\begin{figure}[thbp]
\sfig{Figures/#1.pdf}{.7\columnwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure}
}
\newcommand{\Swide}[2]{
\begin{figure*}[thbp]
\sfig{Figures/#1.pdf}{.8\textwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure*}
}
\newcommand{\Sswide}[2]{
\begin{figure*}[thbp]
\sfig{Figures/#1.pdf}{.7\textwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure*}
}
\newcommand{\Svwide}[2]{
\begin{figure*}[thbp]
\sfig{Figures/#1.pdf}{\textwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure*}
}
\newcommand{\Sjpg}[2]{
\begin{figure}[thbp]
\sfig{Figures/#1.jpg}{0.65\columnwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure}
}
\newcommand{\Spng}[2]{
\begin{figure}[thbp]
\sfig{Figures/#1.png}{0.65\columnwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure}
}
\newcommand{\Sgif}[2]{
\begin{figure}[thbp]
\sfig{Figures/#1.gif}{0.65\columnwidth}
\caption{{\small #2}}
\label{fig:#1}
\end{figure}
}
\newcommand{\Rf}[1]{\ref{fig:#1}}
\newcommand{\rf}[1]{\ref{fig:#1}}
\newcommand{\ec}[1]{Eq.~(\ref{eq:#1})}
\newcommand{\ecalt}[1]{Eq.~\ref{eq:#1}}
\newcommand{\Ec}[1]{(\ref{eq:#1})}
\newcommand{\eeec}[3]{Eqs.~(\ref{eq:#1}, \ref{eq:#2}, \ref{eq:#3})}
\newcommand{\eql}[1]{\label{eq:#1}}
\def\vs{\nonumber\\}
% For various journals
\newcommand{\aap}{A\&A}
\newcommand{\apjs}{ApJS}
\newcommand{\aj}{A.J.}
\newcommand{\mnras}{MNRAS}
\newcommand{\physrev}{Phys. Rev.}
\newcommand{\advastro}{Adv. Astron.}
\newcommand{\jcap}{JCAP}
\newcommand{\apjl}{ApJ}
\usepackage{verbatim}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{tabulary}
\usepackage{tabularx}
\usepackage{todonotes}
\usepackage{hyperref}
%\usepackage[draft]{hyperref}
%\usepackage[disable]{todonotes}
% Make equations look like 2.1, 2.2, etc.
\numberwithin{equation}{section}
% Make LaTex more likely to put some text under the figures.
\renewcommand\dbltopfraction{.85}
%\renewcommand\topfraction{.85}
\renewcommand\textfraction{0.1}
\renewcommand{\floatpagefraction}{0.7}
\renewcommand{\dblfloatpagefraction}{0.7}
\newcommand{\assign}[1]{\noindent {\color{RoyalPurple} Lead writer: \textbf{#1}}}
\newcommand{\contrib}[1]{{\color{RoyalPurple} with contributions from \textbf{#1}}}
% \newcommand{\assign}[1]{}
% \newcommand{\contrib}[1]{}
% Number of galaxies in the final catalogs in millions
\newcommand{\ngalngmix}{3.44}
\newcommand{\ngalimshape}{2.12}
% Our resulting neff values
\newcommand{\neffngmix}{5.7}
\newcommand{\neffimshape}{3.7}
\newcommand{\mrequirement}{0.03}
\newcommand{\crequirement}{\ensuremath{2 \times 10^{-3}}}
\newcommand{\tanshearbias}{0.05}
\newcommand\salpha{{\eta_{\rm IA}}}
\newcommand{\greatdes}{GREAT-DES}
\newcommand{\healpix}{HEALPIX}
\newcommand{\fits}{FITS}
\newcommand{\meds}{MEDS}
\newcommand{\medsfull}{Multi-Epoch Data Structures}
\newcommand{\SE}{single-epoch}
\newcommand{\ME}{multi-epoch}
\newcommand{\medszero}{30.0}
\newcommand{\uberseg}{{\"u}berseg}
\newcommand{\gband}{$g$-band}
\newcommand{\rband}{$r$-band}
\newcommand{\iband}{$i$-band}
\newcommand{\zband}{$z$-band}
\newcommand{\Yband}{$Y$-band}
\newcommand{\grizY}{$g$, $r$, $i$, $z$, $Y$}
\newcommand{\photoz}{photo-$z$}
\newcommand\degree{\ensuremath{\,^\circ}}
\newcommand{\snr}{\ensuremath{S/N}}
\newcommand{\snrw}{\ensuremath{(S/N)_w}}
\newcommand{\snrr}{\ensuremath{(S/N)_r}}
\newcommand{\rgp}{\ensuremath{R_{gp}/R_p}}
\newcommand{\epsf}{\ensuremath{e_\textsc{psf}}}
\newcommand{\Tpsf}{\ensuremath{T_\textsc{psf}}}
\newcommand{\Tgal}{\ensuremath{T_\mathrm{gal}}}
\newcommand{\bfx}{\ensuremath{\mathbf{x}}}
\newcommand{\bfxpt}{\ensuremath{\mathbf{x} + \boldsymbol{\theta}}}
\newcommand{\dximax}{\ensuremath{\delta \xi^\mathrm{max}}}
\newcommand{\SN}{\ensuremath{\sigma_\textsc{sn}}}
\newcommand\lcdm{$\Lambda$CDM}
\newcommand\wcdm{$w$CDM}
% cf. http://tex.stackexchange.com/questions/299/how-to-get-long-texttt-sections-to-break
\newcommand*\justify{%
\fontdimen2\font=0.4em% interword space
\fontdimen3\font=0.2em% interword stretch
\fontdimen4\font=0.1em% interword shrink
\fontdimen7\font=0.1em% extra space
\hyphenchar\font=`\-% allowing hyphenation
}
% how to display code snippets
\newcommand\code[1]{\texttt{\small\justify #1}}
% sextractor stuff
\newcommand{\sex}{\textsc{SEx\-tractor}}
\newcommand{\psfex}{\textsc{PSF\-Ex}}
\newcommand{\aworld}{\code{A\_WORLD}}
\newcommand{\bworld}{\code{B\_WORLD}}
\newcommand{\frad}{\code{FLUX\_RADIUS}}
\newcommand{\magauto}{\code{MAG\_AUTO}}
\newcommand{\magpsf}{\code{MAG\_PSF}}
\newcommand{\classstar}{\code{CLASS\_STAR}}
\newcommand{\spreadmodel}{\code{SPREAD\_MODEL}}
\DeclareMathOperator{\Tr}{Tr}
% names of codes
\newcommand{\imshape}{{\textsc{im3shape}}}
\newcommand{\ngmix}{\textsc{ngmix}}
\newcommand{\galsim}{\textsc{GalSim}}
\newcommand{\LevMar}{\textsc{LevMar}}
\newcommand{\scamp}{\textsc{SCamp}}
\newcommand{\swarp}{\textsc{SWarp}}
\newcommand{\lensfit}{\textsc{lensfit}}
\newcommand\metacal{{\textsc{metacalibration}}} %{{\tt metacal}}
\newcommand\im{{\textsc{im3shape}}} %{{\tt im3shape}}
\newcommand{\ngmixSN}{0.22}
\newcommand{\ngmixbands}{$r,i,z$}
\newcommand{\vece}{\mbox{\boldmath $e$}}
\newcommand{\vecg}{\mbox{\boldmath $g$}}
\newcommand{\coadd}{coadd}
% disc or disk?
\newcommand{\disk}{disc}
% nominal epochs and depth for main survey
\newcommand{\nomepochs}{10}
\newcommand{\nomdepth}{24.1}
% spt-e stuff
\newcommand{\spte}{SPT-E}
\newcommand{\sptearea}{139}
% models
\newcommand{\devauc}{de Vaucouleurs}
\newcommand{\sersic}{S{\'e}rsic}
\newcommand{\neldermead}{Nelder-Mead}
\newcommand{\levmar}{Levenberg-Marquardt}
% regularize how we refer to equations.
% NB. for 2 equations, use eqnb, since eqn2 is not a valid command name.
\newcommand\eqn[1]{equation~\ref{#1}}
\newcommand\eqnb[2]{equations~\ref{#1}~\& \ref{#2}}
\newcommand\eqnc[2]{equations~\ref{#1}~--~\ref{#2}}
\newcommand\Eqn[1]{Equation~\ref{#1}} % If you need to start a sentence with this...
\newcommand\Eqnb[2]{Equations~\ref{#1}~\& \ref{#2}}
% Likewise for figures and tables
\newcommand\fig[1]{Figure~\ref{#1}}
\newcommand\figb[2]{Figures~\ref{#1}~\& \ref{#2}}
\newcommand\tab[1]{Table~\ref{#1}}
\newcommand\tabb[2]{Tables~\ref{#1}~\& \ref{#2}}
% Some styles doesn't need the word Appendix before the \ref. So this way it is easy to switch.
\newcommand\app[1]{Appendix~\ref{#1}}
%\newcommand\app[1]{\ref{#1}}
\newcommand\be{\begin{equation}}
\newcommand\ee{\end{equation}}
\def\bea{\begin{eqnarray}}
\def\eea{\end{eqnarray}}
% Feel free to add your own \note item with a different color.
% It's a handy way to comment on particular locations in the
% text.
% (Note: the !40 just makes it a little transparent, so easier to read the black text.)
\newcommand\noteol[1]{\todo[color=cyan!40, inline, size=\small]{Ofer: #1}}
\newcommand\scott[1]{\todo[color=blue!40, inline, size=\small]{Scott: #1}}
\newcommand\ESS[1]{\todo[color=orange!40, inline, size=\small]{ESS: #1}}
\newcommand\ADW[1]{\todo[color=green!40, inline, size=\small]{ADW: #1}}
\newcommand\TK[1]{\todo[color=magenta!40, inline, size=\small]{TK: #1}}
\newcommand\MAT[1]{\todo[color=yellow!40, inline, size=\small]{MAT: #1}}
\newcommand\AAP[1]{\todo[color=cyan!60, inline, size=\small]{AAP: #1}}
\newcommand\PM[1]{\todo[color=blue!40, inline, size=\small]{PM: #1}}
\newcommand\GB[1]{\todo[color=red!10, inline, size=\small]{GB: #1}}
\newcommand\DG[1]{\todo[color=green!60, inline, size=\small]{DG: #1}}
\newcommand\ER[1]{\todo[color=magenta!60, inline, size=\small]{ER: #1}}
\newcommand\JF[1]{\todo[color=red!40, inline, size=\small]{JF: #1}}
\newcommand\EB[1]{\todo[color=teal!40, inline, size=\small]{EB: #1}}
\newcommand\dragan[1]{\todo[color=pink!60, inline, size=\small]{DH: #1}}
\newcommand\notenm[1]{\todo[color=purple!40, inline, size=\small]{NM: #1}}
\newcommand\notear[1]{\todo[color=green!20, inline, size=\small]{AR: #1}}
\newcommand\notesb[1]{\todo[color=pink, inline, size=\small]{SLB: #1}}
\newcommand\notess[1]{\todo[color=purple!20, inline, size=\small]{SS: #1}}
\newcommand\noteek[1]{\todo[color=green!20, inline, size=\small]{EK: #1}}
\newcommand\noteaf[1]{\todo[color=pink!20, inline, size=\small]{AF: #1}}
\newcommand\jab[1]{\todo[color=green!40, inline, size=\small]{JAB: #1}}
% notes that have been (hopefully) resolved ... same command with an 'R' at the end, and white
\newcommand\ERR[1]{\todo[color=white!60, inline, size=\small]{ER: #1}}
\newcommand\JFR[1]{\todo[color=white!40, inline, size=\small]{JF: #1}}
\newcommand\noteekR[1]{\todo[color=white!20, inline, size=\small]{EK: #1}}
\newcommand\noteolR[1]{\todo[color=white!40, inline, size=\small]{Ofer: #1}}
\newcommand\EBR[1]{\todo[color=white!40, inline, size=\small]{EB: #1}}
\newcommand\scottR[1]{\todo[color=white!40, inline, size=\small]{Scott: #1}}
\newcommand\ESSR[1]{\todo[color=white!40, inline, size=\small]{ESS: #1}}
\newcommand\ADWR[1]{\todo[color=white!40, inline, size=\small]{ADW: #1}}
\newcommand\TKR[1]{\todo[color=white!40, inline, size=\small]{TK: #1}}
\newcommand\MATR[1]{\todo[color=white!40, inline, size=\small]{MAT: #1}}
\newcommand\AAPR[1]{\todo[color=white!60, inline, size=\small]{AAP: #1}}
\newcommand\PMR[1]{\todo[color=white!40, inline, size=\small]{PM: #1}}
\newcommand\GBR[1]{\todo[color=white!40, inline, size=\small]{GB: #1}}
\newcommand\DGR[1]{\todo[color=white!60, inline, size=\small]{DG: #1}}
\newcommand\DHR[1]{\todo[color=white!60, inline, size=\small]{DH: #1}}
\newcommand\notenmR[1]{\todo[color=white!40, inline, size=\small]{NM: #1}}
\newcommand\notearR[1]{\todo[color=white!20, inline, size=\small]{AR: #1}}
\newcommand\notesbR[1]{\todo[color=white, inline, size=\small]{SLB: #1}}
\newcommand\notessR[1]{\todo[color=white!20, inline, size=\small]{SS: #1}}
\newcommand\red[1]{\textcolor{Red}{#1}}
\newcommand\green[1]{\textcolor{Green}{#1}}
\newcommand\bei{\begin{itemize}}
\newcommand\eei{\end{itemize}}
\newcommand\bee{\begin{enumerate}}
\newcommand\eee{\end{enumerate}}
\newcounter{syscounter} % So we can continue an enumeration of robustness checks
% Note to MNRAS editor: changing this to a no op removes markup for the changes made for
% the referee report.
%\newcommand\edit[1]{\textcolor{Red}{#1}}
\newcommand\edit[1]{#1}
\newcommand\magenta[1]{\textcolor{Magenta}{#1}}
% allow sets of equations (e.g. using align) to page break
\allowdisplaybreaks
\begin{document}
%\linenumbers
\title{AI and Physics: Spring 2021}
%\input{DES-2017-0226_author_list.tex}
%%%\author{Dark Energy Survey Collaboration}
\date{\today}
%\pubyear{2017}
\maketitle
\section{Computational Imaging}
The speaker this week is Laura Waller, who has made quite a bit of software \href{http://www.laurawaller.com/opensource/}{public}. Doing some of the demos and trying them out on different images would make a good project.
There are several biophysics projects she has been involved in recently, so it makes sense to cover the following topics:
\bei
\item Fluorescence
\item Point Spread Function
\item Phase Imaging
\eei
\subsection{Fluorescence}
\Spng{JablonskiSimple}{Energy level diagram used in fluorescent imaging. Photons with energy $h\nu$ are injected into the sample. One of the electrons in the molecule jumps to an excited level, which is highly unstable. However, the fastest way for the electron to drop in energy is via a transition to a lower energy rotational state (in this case ${}^3A$). Since rotational energies are typically much smaller ($\sim10^{-3}$ eV) than orbital transition energies ($\sim 1$ eV), the photon that is emitted when the electron eventually drops down to the ground state has only slightly lower energy.}
A simple example is depicted in the figure. By pumping laser light into a sample, experimenters can observe emitted radiation at slightly lower energies (typically still in the optical range though). Since the decays happen so rapidly (nsec time scales), it is possible to localize the molecules and track their motion. It's even better than that because biophysicists have figured out ways to take known molecules (called ``fluorescent dyes'') and attach them to components of the cell, such as proteins, nuclei, and lipids. So, any part of the cell can be tracked regardless of its molecular structure.
\subsection{From 2D to 3D}
The images are two-dimensional, but scientists have learned how to extract information about the third dimension, the distance to the object being studied. There are a number of ways to do this, but one \cite{unknown} uses a specially designed mask to make the point spread function (PSF) depend on the radial distance from the detector. The PSF is a ubiquitous concept in optics, with ramifications from biophysics to astrophysics. It is simply the statement that the true two-dimensional image is related to the observed 2D image via a convolution:
\be
I^{\rm obs}(x,y) = \int dx'\,dy'\, I^{\rm true}(x',y')\, P(x-x',y-y').\ee
A perfect camera under perfect seeing conditions would have a Dirac delta function PSF so the integral would collapse and the observed image would be equal to the true image. This never happens, so to reconstruct the image, one needs to account for the PSF. In the example shown in Fig.~\rf{phasemask}, samples further away would appear smaller than those closer to us, so we would be able to infer the distance to the sample.
\Spng{phasemask}{The PSF varies depending on the distance to the sample.}
This technique therefore enables biologists to get 4D images of the interior of cells (3 space and 1 time). One more comment: when implementing algorithms numerically, the integral in the PSF equation becomes a matrix equation:
\be
I^{\rm obs} = P I^{\rm true}\eql{iobitr}
.\ee
As an example, suppose the observed image has a total of $N=1000$ pixels, and you pixelize the true image to have the same number of pixels. Then, both $I$'s are simply vectors with $N$ components. The integration turns into a sum:
\be
I^{\rm obs}_i = \sum_j P_{ij} I^{\rm true}_j
\ee
while the PSF that connects the two intensities is an $N\times N$ matrix.
Obtaining the true image from the observed image is the inverse problem, and there are a vast number of ways to do this. Just a heads up that many of these ways involve applying a ``cost function.''
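To make the matrix formulation concrete, here is a minimal sketch (in Python/NumPy, not from the lecture) that builds a Gaussian PSF matrix for a toy one-dimensional ``image,'' blurs a made-up true signal, and recovers an estimate by minimizing a ridge-regularized least-squares cost; the sizes, noise level, and regularization strength are all invented for illustration.
\begin{verbatim}
import numpy as np

# Toy 1D "image" with N pixels (all sizes invented for illustration).
N = 100
x = np.arange(N)
I_true = np.zeros(N)
I_true[[20, 50, 55, 80]] = [1.0, 2.0, 1.5, 0.7]   # a few bright "point sources"

# PSF matrix P_ij: a Gaussian blur, so I_obs = P @ I_true (the matrix equation).
sigma_psf = 2.0
P = np.exp(-0.5 * ((x[:, None] - x[None, :]) / sigma_psf) ** 2)
P /= P.sum(axis=1, keepdims=True)                 # normalize each row

# Observed image: blurred truth plus Gaussian pixel noise.
rng = np.random.default_rng(0)
I_obs = P @ I_true + 0.01 * rng.standard_normal(N)

# Inverse problem: minimize |P I - I_obs|^2 + lam |I|^2, a simple cost function
# with a ridge regularizer to tame noise amplification.
lam = 1e-3
I_hat = np.linalg.solve(P.T @ P + lam * np.eye(N), P.T @ I_obs)
print("reconstruction error:", np.linalg.norm(I_hat - I_true))
\end{verbatim}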
\subsection{Phase Determination}
Many objects in the body do not emit or block much light, so they are very hard to detect. Waller's group has worked on detecting these unseen objects by measuring their ``transmission function.'' Even if the amplitude of this function is one, so that very little light is stopped, the light can scatter a bit so that the phase changes, and clever design~\cite{bostan2020deep} can extract information about the phases -- and therefore about the tissues, say, between us and the emitting proteins.
Light is a traveling wave with electric field oscillating in the direction perpendicular to the direction of propagation. As it passes from one material through another, the wave is multiplied by the transmission function:
\be
E_{\rm after} = t\times E_{\rm before}
.\ee
In biological systems this function --- which is related to the index of refraction --- varies from one position to another. In fact, \ec{iobitr} can be generalized to include the transmission function. The 10,000 foot view is that one can model the observed intensities as a function of the transmission function, the PSF, and the true image and try to get the best fit to real or simulated data. This best-fit transmission function contains valuable information about the tissues in the body.
\section{AI Basics}
Most problems in AI have some commonalities. Let's use the example of image reconstruction, which is what was discussed in the \href{https://cmu.zoom.us/rec/share/2m9MSmtj6mbf7A_ueusvKnZIFItFV3HPI7QVJ7zlfSCCZGRzBt3iTHKs_fcKLJKx.3rNWZExDpw63YeTg}{seminar}. Consider a picture with 1000 pixels. The image is defined by a number in each pixel denoting the intensity of the image there (let's keep things black and white, so there is no need for more than one number per pixel), with say 0 meaning the image is dark in that pixel and 255 meaning it is as bright as possible. These 1000 numbers -- the intensities of the image in each pixel -- are called {\bf features}. They are what are used by the AI to infer whatever you want to infer. Different AI problems will have different numbers of features.
One image is called a {\bf sample}. Typically, there are many different samples. The more samples you have, the easier it is to build (or {\bf train}) your algorithm. In fact, we usually take a subset of the samples and use them only for training the algorithm. This subset is called the {\bf training set}. Whatever algorithm we design from the training set, we then test on the remaining samples, called the {\bf test set}. If the results are about as good on the test set as on the training set, then we are good to go, and can apply the algorithm to new samples with some confidence as to how well it will perform. You will often hear people talk about the opposite though: {\bf over-training} or, closely related, {\bf over-fitting} the data. This happens when your algorithm tries too hard to fit the training data and ends up fitting stuff that is just noise or weird glitches. Then the algorithm will not work so well on new data.
Much of AI is based on {\it supervised learning}. This is when every sample in the training set, at least, carries with it a {\bf target} or a {\bf label}. The target could be as complex as the set of features itself. In the case of an image, it can be the true image. Or it could be something much simpler: e.g., specifying whether the image is of a cat or a dog.
However you design your algorithm, you almost always have to introduce a way of quantifying how good it is, how well it does what it is supposed to do. This is typically called the {\bf cost function}. In the example of the image, the cost function could be
\be
C = \sum_{j=1}^{N_{samples}}\ \sum_{i=1}^{N_{pixels}} \left( I^{\rm Predicted,j }_i({\rm Features}) - I^{\rm true, j}_i\right)^2.\eql{cost}\ee
Here, the first term in parentheses is the prediction for the intensity of the image in pixel $i$ for sample $j$, which depends on all the features. This prediction is your AI algorithm: it inputs the features and outputs a prediction. The second term is the true image intensity in pixel $i$ for the $j$th sample, which we also call the target. Typically the prediction will depend on a lot of parameters and those parameters will be varied until the cost function is minimized.
\subsection{Simple Example: Linear Regression}
Let's work out a simple example. Your target is $y$ and you are given a feature $x$. You believe that $y$ and $x$ are linearly related so that your model is
\be
y^{\rm predicted} = mx+b
.\ee
The goal of your algorithm is to take in all the samples you have, each of which contains a feature $x$ and a target $y$, and learn the coefficients of the model, in this case $m$ and $b$. Your cost function will be
\be
C = \sum_{i=1}^{N_{\rm samples}} \left( y_i - [mx_i+b]\right)^2.\eql{costt}\ee
You can actually do this minimization by hand, but generally computers can do things faster and generally the problems are not so easy to solve analytically. But you get the basic idea: you want to determine the parameters of the model by minimizing the cost function.
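As a concrete (entirely made-up) illustration, the sketch below generates noisy samples from a straight line and recovers $m$ and $b$ both with a closed-form least-squares fit and by gradient descent on the cost function of \ec{costt}; the data, learning rate, and iteration count are invented.
\begin{verbatim}
import numpy as np

# Fake training samples: one feature x and one target y (numbers invented).
rng = np.random.default_rng(1)
x = rng.uniform(0, 10, size=200)
y = 2.5 * x + 1.0 + rng.normal(0, 1.0, size=200)    # true m = 2.5, b = 1.0

# Closed-form least-squares fit, for comparison.
m_ls, b_ls = np.polyfit(x, y, deg=1)

# Gradient descent on C = sum_i (y_i - (m x_i + b))^2.
m, b, lr = 0.0, 0.0, 0.02
for _ in range(5000):
    resid = y - (m * x + b)
    m += lr * np.mean(resid * x)      # proportional to -dC/dm
    b += lr * np.mean(resid)          # proportional to -dC/db

print(m_ls, b_ls)                     # least-squares answer
print(m, b)                           # gradient-descent answer, nearly the same
\end{verbatim}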
\section{Photometric Redshifts}
The universe is expanding. This can be inferred from the fact that almost all galaxies are moving away from us and the farther away they are, the faster they are moving. One way to observe how fast galaxies are moving away from us is to observe the shift in energies of photons emitted in atomic transitions. These are called {\it emission lines}. For example, when an electron in hydrogen is in the $n=2$ state and drops down to the $n=1$ state, it emits a photon with an energy equal to 10.2 eV, corresponding to a wavelength equal to 121.5 nm. If the atom is moving away from us, then the line will appear at a longer wavelength. I.e., it will be {\it redshifted}. Similarly, if a galaxy has a lot of hydrogen in it, then we will see a peak in emission at $\lambda=121.5\times(1+z)$ nm, where $z$ is the redshift of emission.
\Spng{redshift}{Example of a spectrum of a galaxy. All the emission lines have been shifted by a factor of $1+z=3.2$ so the galaxy is said to have a redshift of 2.2.}
In cosmology, things moving away from us faster are farther away, so we can turn this around: obtain a galaxy's redshift and use it to determine how far away the galaxy is. This simple trick has the power to transform a 2D image of the sky to a full three-dimensional image.
Clearly then, it would be wonderful to obtain spectra of all galaxies in a given survey. E.g., the Dark Energy Survey has images of hundreds of millions of galaxies. A 3D map of all these galaxies would carry an enormous amount of information about the universe (including things like dark energy and dark matter). However, spectra take a long time to measure; the only reason DES has so many images is because it spends so little time on each galaxy.
\Spng{photoz0}{Magnitudes change for a galaxy at redshift 0}
Instead, what DES and other imaging surveys (such as the upcoming Vera Rubin Observatory survey) obtain is the flux in a given range of wavelengths, or actually a set of these fluxes in different wavelength bands, as depicted in Fig.~\rf{photoz0}. Here, what is plotted on top is the relative {\it magnitude}. Magnitude in astronomy is defined such that larger numbers mean fainter objects. E.g., the Sun has a magnitude of about $-26.7$ (very low), and the faintest thing you can see by eye is about $+6$.
The formula is
\be
m = -2.5\log_{10}({\rm Flux}) + {\rm constant}
.\ee
Surveys find stars and galaxies as faint as 25th magnitude or even fainter.
\href{https://www.kaggle.com/c/photometric-redshift-estimation-2019}{Here} is a simple illustration of how the magnitudes in each band vary depending on how far away from us that galaxy happens to be. The challenge for photometric redshifts is to take the observed magnitudes in the different bands and estimate the redshift of the galaxy. These are called {\it photometric redshifts}.
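As a schematic example (not tied to the actual competition data), the sketch below treats magnitudes in four bands as the features and the redshift as the target, and trains a random-forest regressor on a synthetic catalog; the band coefficients and noise levels are invented purely to produce colors that drift with redshift.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Synthetic stand-in for a photometric catalog: 4 band magnitudes (features)
# whose colors drift with redshift (target).  Entirely made up for illustration.
rng = np.random.default_rng(2)
n = 5000
z = rng.uniform(0.0, 1.5, size=n)                       # "true" redshifts
base = rng.normal(22.0, 1.0, size=n)                    # overall brightness
mags = np.column_stack([base + 0.8 * z,                 # g
                        base + 0.3 * z,                 # r
                        base - 0.1 * z,                 # i
                        base - 0.3 * z])                # z band
mags += rng.normal(0, 0.05, size=mags.shape)            # photometric noise

X_train, X_test, z_train, z_test = train_test_split(mags, z, test_size=0.3,
                                                    random_state=0)
model = RandomForestRegressor(n_estimators=200, random_state=0)
model.fit(X_train, z_train)                             # training set
z_pred = model.predict(X_test)                          # test set
print("scatter on test set:", np.std(z_pred - z_test))
\end{verbatim}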
\section{Particle Physics, Dark Matter, and the Large Hadron Collider}
\href{https://cmu.zoom.us/rec/share/M26ZDfLR_giq7wdeln1DN4T9mGCBO9OBklY6bBRwkD2ivbNxnB-5sqsqdFGTrjEI.3Ug9tThx9aFomknL}{Link to Lecture; Passcode: DDd*6pN\^}
Particle physics is the culmination of the age-old search for the fundamental constituents of matter. It has been {\bf too} successful over the last 60 years. Every experiment carried out is consistent with the following, which is part of what is now called the Standard Model:
\bei
\item Every material on Earth is comprised of electrons, neutrons, and protons.
\item The electron is a fundamental, point particle
\item Protons and neutrons are each comprised of 3 fundamental particles, quarks. The proton consists of 2 up quarks (each with electric charge $+2/3$) and one down quark (with electric charge $-1/3$). Neutrons are comprised of $udd$.
\item Protons are electrically charged, which means they (and electrons) interact with photons. That means that whenever you have a lot of them, they tend to emit light.
\item Neutrons do not interact (much) with photons, but the quarks inside protons and neutrons are bound together by the {\it strong force}, which is mediated by {\it gluons} (just as the electric force is mediated by photons).
\eei
There are other fundamental particles, but not too many, and some of them are almost copies of these 3 (e.g., the muon is almost identical to the electron except that it is more massive).
In cosmology, there is overwhelming evidence for the presence of massive particles that are not seen, that is, do not emit light. These cannot be protons or electrons (which do emit light) or even free neutrons, because free neutrons decay into protons. This so-called {\it dark matter} cannot consist of any of the fundamental particles in the Standard Model. This is potentially a gift to the field of particle physics, because it is evidence that there is more left to discover.
The most direct way of discovering new particles is to build large accelerators that collide protons together. `Large' is required to accelerate them to high energy. High energy is needed because these new particles likely have very large masses, much larger than the proton's. When protons collide they produce lots of particles (e.g., Fig.~\rf{lhcevent}), so not all of the energy goes into a single particle. Roughly, the amount of energy in a collision is 1 TeV = $10^{12}$ eV, and again roughly, the collisions can produce particles with masses only a fraction of that. Many heavy particles decay quickly and many are stable for the short time it takes them to leave the detector traveling at close to the speed of light. There are some particles though that might decay on time scales just right so that you could observe a {\it displaced} vertex: that is, an indication of 2 new particles being produced from a point displaced from the center.
The Dark QCD model is one of many (really, many, many!) models for dark matter. It is based on the following observation:
\bei
\item We know how much dark matter is needed to explain observations
\item The total mass density is about 6 times larger than the mass density in protons and neutrons and electrons
\item Build a model wherein the number of dark matter particles is roughly equal to the number of protons but the mass of the dark matter particle is 6 times larger.
\eei
The Dark QCD model does this by postulating the existence (this means ``making up'') of another type of gluon that binds together dark particles. The stuff in the Dark QCD sector has to interact weakly with the stuff in the Standard Model in order for the masses to be about the same and for the numbers to be about the same. It is this interaction that is then exploited so that in the LHC you have a process like
\be
p + p \rightarrow (SM) + X \rightarrow (SM) + SM
.\ee
Here the final SM stands for particles in the standard model that might leave tracks in the detectors at the LHC.
\Sfig{lhcevent}{The tracks left in the ATLAS detector by particles produced in a single collision of two energetic protons accelerated in the Large Hadron Collider.}
\section{Bayesian Analysis}
Consider the following simple model for a data point:
\be
D = s + n
\ee
where $D$ is the observed data point, $s$ is the signal you are trying to extract and $n$ is noise. Here, we will assume that the noise is gaussian distributed with mean 0 and variance $\sigma^2$, so
\be
P(n) = \frac{1}{\sqrt{2\pi\sigma^2}}\, e^{-n^2/2\sigma^2}.\ee
But $n=D-s$, so we have the following probability for obtaining the data given a particular value of the signal:
\be
P(D|s) = \frac{1}{\sqrt{2\pi\sigma^2}} \, e^{-(D-s)^2/2\sigma^2}.\eql{like}\ee
The number $s$ here is what we are trying to determine and so it is a parameter in the model. This conditional probability then tells us how likely it is that we will get $D$ given the parameter value $s$. It is called the {\it likelihood function}.
Bayes' Theorem says we are not really interested in the likelihood; rather we care much more about what the value of the parameter $s$ is given the data. That is, we want
\be
P(s|D).\ee
To get this, you can use an elementary fact from probability theory:
\bea
P(x\cap y) &=& P(x|y) P(y)\vs
&=& P(y|x) P(x)
\eea
where the probability on the left is the probability that both $x$ and $y$ are true. Equating the two right-hand sides of this and using instead of $x,y$ the things that interest us $D,s$ leads to
\be
P(s|D) = \frac{P(D|s)\, P(s)}{P(D)}
.\ee
This is exactly what we want, and it is what you will see plotted in physics papers often: the probability distribution of a given parameter. From this you can obtain the best fit value (where $P(s|D)$, called the {\it posterior}, peaks), the region that is excluded at 95\% confidence level, etc. There are two factors that connect the posterior to the likelihood, one trivial and the other crucial. The trivial one is the denominator, which seems very hard to predict. Fortunately, we simply don't care about it; it does not depend on the thing we are interested in, $s$, and so serves only as a normalization factor. And, anyway, we already know how to normalize probabilities (the integrated probability must be 1). The other factor in the numerator, $P(s)$, is crucial. It is called the {\it prior}. It represents our expectations of what the signal is likely to be {\it before} we obtain the data.
If you don't think the prior is important, consider the following example. Consider the genetic disease Aceruloplasminemia. It is related to abnormal levels of iron in your brain, and you don't want to have it. Symptoms can start occurring when someone reaches the age of 20. One of the symptoms is blurry vision. Suppose you experience blurry vision; that will be the data. You want to know if you have the disease (that will be the signal). Now almost all people with this disease get blurry vision. So, $P(D|s) \simeq 1$. However, the probability that you have the disease, $P(s)$, is very small; even in Japan, where it is most prevalent, only 1 in 2 million people have it. Let's use that for an estimate of $P(s)=5\times 10^{-7}$. What about blurry vision? Roughly half the people in the world have it. Therefore, if you get blurry vision, instead of worrying that you have Aceruloplasminemia, do the following calculation:
\be
P(s|D) = \frac{5\times 10^{-7} \times 1}{0.5} = 10^{-6}.\ee
There is a one-in-a-million chance you have the disease (for reference, the mortality rate for someone my age is roughly 1\%).
We can generalize to more than one data point. Consider the case where you take many measurements $D_i$, each of which is generated from the same signal and the same noise distribution. So each time the signal will be the same but the noise will be different. Then, the likelihood is simply the product over all $N$ measurements:
\be
\mathcal{L} = \frac{1}{(2\pi\sigma^2)^{N/2}}\, \exp\left\{ - \sum_{i=1}^N \frac{(D_i-s)^2}{2\sigma^2} \right\}
\ee
To find the best fit value of $s$, simply find the value of $s$ that maximizes the posterior (the likelihood times the prior). Suppose the prior were uniform, so that the posterior was equal to the likelihood. Then, maximizing the posterior would correspond to minimizing the negative of the log-likelihood, i.e., finding
\be
\hat s = {\rm arg\ min}_s \left( \sum_{i=1}^N \frac{(D_i-s)^2}{\sigma^2} \right)
.\ee
In this case, when the noise is drawn from the same distribution for all measurements, maximizing the likelihood then is equivalent to using the cost function we identified in \ec{cost} and \ec{costt}.
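A minimal numerical check of this statement, with invented numbers: draw data from a Gaussian around a known signal, evaluate the posterior on a grid assuming a uniform prior, and confirm that it peaks at the sample mean, which is the minimizer of the cost above.
\begin{verbatim}
import numpy as np

# Simulated data: N measurements of a constant signal s with Gaussian noise.
rng = np.random.default_rng(3)
s_true, sigma, N = 1.7, 0.5, 50                 # invented values
D = s_true + sigma * rng.standard_normal(N)

# Posterior on a grid with a uniform prior:
# P(s|D) proportional to exp(-sum_i (D_i - s)^2 / (2 sigma^2)).
s_grid = np.linspace(0.0, 3.0, 2001)
chi2 = ((D[None, :] - s_grid[:, None]) ** 2).sum(axis=1) / sigma**2
post = np.exp(-0.5 * (chi2 - chi2.min()))
post /= np.trapz(post, s_grid)                  # normalization, the P(D) factor

print("posterior peak:", s_grid[np.argmax(post)])
print("sample mean   :", D.mean())              # the arg-min of the cost function
\end{verbatim}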
\section{Cosmological Simulations}
The goal of cosmological simulations is to start from initial conditions and evolve the system forward using the laws of physics to obtain a universe that looks like ours today. This sounds hard, but we are helped greatly by the fact that most of the mass in the universe is in the form of dark matter. Dark matter has the following properties:
\bei
\item It is likely {\it cold}; i.e., the initial velocities are very small, with no thermal contribution, and there are no relativistic effects
\item They obey Newton's laws of motion
\item The only force that acts on them is gravity ... due to all the other dark matter particles
\eei
It turns out that gravity alone can start from initial conditions and produce a universe that looks pretty similar to our universe. The basic idea is that early on the universe was pretty smooth, so that the density varied from one part of the sky to another by only about 1 part in 10,000. However, over the course of billions of years the slightly over-dense regions attracted more and more matter until eventually the over-densities became nonlinear and large structures such as galaxies, stars, and planets formed.
\subsection{N-Body simulations}
The simplest approach treats the dark matter as $N$ particles and repeatedly performs the following steps (a minimal sketch in code appears after the list):
\bee
\item Compute the force on all particles. In principle $\vec F_i = -Gm_i\sum_{j\ne i} \frac{m_j (\vec r_i-\vec r_j)}{\vert \vec r_i-\vec r_j\vert^3}$
\item Update the particles' velocity: $\vec v_{t+1} = \vec v_t + (\vec F_i/m_i)\times \Delta t$
\item Update the particles' positions: $\vec x_{t+1} = \vec x_t + \vec v \times \Delta t$
\item Go back to step 1
\eee
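Here is the direct-summation version of these steps as a minimal sketch (toy particle number, units with $G=1$, and an invented softening length to keep the example self-contained); a real cosmological code would replace step 1 with the Particle-Mesh scheme described next.
\begin{verbatim}
import numpy as np

# Direct-summation N-body step, O(N^2); toy units with G = 1, invented softening.
rng = np.random.default_rng(4)
N, G, eps, dt = 200, 1.0, 1e-2, 1e-3
m = np.ones(N)
pos = rng.standard_normal((N, 3))
vel = np.zeros((N, 3))

def accelerations(pos):
    dr = pos[:, None, :] - pos[None, :, :]            # r_i - r_j, shape (N, N, 3)
    r3 = (np.sum(dr**2, axis=-1) + eps**2) ** 1.5     # softened |r_i - r_j|^3
    np.fill_diagonal(r3, np.inf)                      # no self-force
    return -G * np.sum(m[None, :, None] * dr / r3[:, :, None], axis=1)

for step in range(100):
    acc = accelerations(pos)    # step 1: force per unit mass on every particle
    vel += acc * dt             # step 2: update velocities
    pos += vel * dt             # step 3: update positions (then back to step 1)
\end{verbatim}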
In practice, Step 1 would take order $N^2$ calculations. Typically, $N\simeq 10^{10}$ so this is not feasible. There are a variety of ways around this but the simplest is to use a grid or a mesh and is called Particle-Mesh. There are still $N$ particles, but the system allocates the density of each particle to nearby grid sites. Then each grid site has a density associated with it, and Poisson's equation can be solved by first taking the Fourier transform and then solving the algebraic equation:
\be
-k^2\tilde\Phi = 4\pi G \tilde\rho.\ee
Armed with the potential, one can take the inverse Fourier transform, obtain the force on the grid sites, and then interpolate to each particle's position. Since a fast Fourier transform (FFT) takes only $N\ln(N)$ steps, the calculation is no longer $N^2$. The bottom line though is that there is a resolution limit, dictated in part by the grid size.
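The Fourier-space Poisson solve itself is only a few lines; below is a sketch on a periodic grid with arbitrary units (this is just the algebra of the equation above, not a full Particle-Mesh code with mass assignment and interpolation).
\begin{verbatim}
import numpy as np

# Solve nabla^2 Phi = 4 pi G rho on a periodic grid via FFT (toy units, G = 1).
n, L, G = 64, 1.0, 1.0
rho = np.random.default_rng(5).standard_normal((n, n, n))
rho -= rho.mean()                                 # keep only the density contrast

k = 2 * np.pi * np.fft.fftfreq(n, d=L / n)
kx, ky, kz = np.meshgrid(k, k, k, indexing="ij")
k2 = kx**2 + ky**2 + kz**2
k2[0, 0, 0] = 1.0                                 # avoid dividing by zero at k = 0

rho_k = np.fft.fftn(rho)
phi_k = -4 * np.pi * G * rho_k / k2               # from -k^2 Phi_k = 4 pi G rho_k
phi_k[0, 0, 0] = 0.0                              # the mean of Phi is arbitrary
phi = np.real(np.fft.ifftn(phi_k))

# The force per unit mass on the grid is -grad Phi; e.g. its x component:
gx = np.real(np.fft.ifftn(-1j * kx * phi_k))      # g_x = -dPhi/dx (spectral derivative)
\end{verbatim}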
\subsection{Hydrodynamic Simulations}
To make realistic galaxies, you must include ordinary matter, called {\it baryons}. Usually, then, there are multiple components:
\bee
\item dark matter, relatively easy to track, as above
\item atoms of various types (hydrogen, helium, carbon, etc.) in different ionization states
\item Radiation Field
\item stars, also easy to track since they interact only through gravity and so can be treated like the dark matter. The hard part is designing algorithms for when they form and when they explode.
\eee
The physics of item 2 is actually pretty well known. The reactions that produce different ionization states can be coded in and are temperature-dependent. It depends a lot on item 3, though, which is very difficult to model. The baryons -- even if divided into different elements and ionization states -- are driven by known equations. One example of the way baryons are implemented is on a grid, such that each cell has a density (of each species), temperature, and velocity. The two fundamental equations of hydrodynamics are the continuity equation
\be
\dot\rho + \nabla\cdot(\rho\vec v) = 0.
\ee
and the velocity equation:
\be
\frac{\partial\vec v}{\partial t} + \vec v\cdot\nabla \vec v + \frac{\nabla p}{\rho} = \vec g
.\ee
Solving this numerically is very difficult (it's a career) and typically hydro codes can take ten times longer to run than N-Body codes. So a high-resolution hydro cosmological simulation is very expensive (it can take 6 months on a supercomputer). Finding ways to learn about high-resolution detail while carrying out a low-resolution simulation would clearly be immensely valuable.
\section{(Quantum) Cryptography}
Any message can be delivered in base 2, a series of 0's and 1's. Consider a simple 3-bit message:
\be
M = 0\,0\, 1
\ee
Suppose I want to send this message to Alice but I don't want anybody who intercepts it to be able to understand it. So, we initially exchange a {\it key}:
\be
K = 1\, 0\, 1
.\ee
This is just a random choice, but the important thing is that only the two of us know the key. To encrypt my message, I send the bit-added sum (bitwise addition modulo 2, i.e., XOR) of $M$ and $K$:
\be
C = M+K = 1\, 0\, 0
.\ee
Now, Alice gets my message and uses the fact that in this weird bit-addition $X+X=0$ for any $X$, so she computes
\be
C+K = M+ K+K = M
\ee
or
\be
C+K = 0\, 0\,1
\ee
the correct answer!
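The bit arithmetic above is just the bitwise XOR; a tiny sketch with the message and key from the text:
\begin{verbatim}
# One-time pad with the 3-bit example from the text (XOR = addition mod 2).
M = [0, 0, 1]                         # message
K = [1, 0, 1]                         # shared secret key
C = [m ^ k for m, k in zip(M, K)]     # ciphertext sent over the public channel
print(C)                              # [1, 0, 0]
print([c ^ k for c, k in zip(C, K)])  # Alice recovers [0, 0, 1] = M
\end{verbatim}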
What if Eve (the eavesdropper) steals our key? Then, there is no way of ensuring our messages will be safe. Quantum cryptography provides a way out. We start by considering 4 states, most easily understandable in terms of the Pauli spin operators. The first 2 states are eigenstates of the $\sigma_z$ operator and give $\pm1/2$ when you measure the $z$-component of spin.
\bea
0: && Z:|0\rangle
\vs
1: &&Z:|1\rangle\eea
The 3 ``columns'' here denote: the value of the qubit; the basis; and the state in the $Z$ basis.
Now add two other states that are linear combinations of the $\sigma_z$ eigenstates, chosen so that they are eigenstates of $\sigma_x$:
\bea
0: && X:|+\rangle\equiv (1/\sqrt{2})\left( |0\rangle + |1\rangle\right)
\vs
1: && X:|-\rangle\equiv (1/\sqrt{2})\left( |0\rangle - |1\rangle\right)
\eea
The first column is the value of the qubit, so the first line here is a way of transmitting the exact same information (0) as in the first line above. But, in this case, the state is different: it's a linear combination of the $\pm z$-states. These states are eigenstates of the $\sigma_x$ operator, so you get 0 for sure if you measure the first of these and 1 for sure if you measure the second.
Now I will send Alice a state, but she needs to know which basis to measure in. E.g., if I send her the first state $|0\rangle$ and she measures in the $X$ basis, there is a 50-50 chance she will incorrectly get 1 instead of 0. Or if I send her $|+\rangle$ and she incorrectly measures in the $Z$ basis, she has a 50-50 chance of incorrectly getting 1. So, she needs the basis for each bit. What she does is guess. So suppose I sent her:
\be
|+\rangle\, |0\rangle\, |0\rangle
\ee
and want her to measure in the $X,Z,Z$ bases. If she does, then she will get 0,0,0. But neither she (nor anybody else) knows which basis to measure in, so she guesses $X,X,X$. She will get the first state right, but only has a 50-50 chance of getting the 2nd and 3rd states right. She calls me up over a public line and tells me what basis she used. I tell her that she can trust only the measurement on the first bit. We can then use that bit (or, if we did this for lots more bits, all the bits that I tell her she can use) as our key. Will Eve know our key? No! All she knows is the basis in which one should have measured the first bit. But she does not know the value of this bit. Of course, if she also intercepted our transmission, there is a chance that she will have measured that first bit using the correct basis, but if you do this for lots of bits, clearly she will get some wrong. She will not have the key.
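A purely classical simulation of this basis-matching step (ignoring Eve and any actual quantum hardware; a wrong basis choice is modeled as a coin flip, as described above):
\begin{verbatim}
import numpy as np

# Toy BB84-style key exchange: keep only bits where sender and receiver
# happened to use the same basis; a wrong basis gives a random result.
rng = np.random.default_rng(6)
n = 1000
bits = rng.integers(0, 2, n)           # my random bit values
my_basis = rng.integers(0, 2, n)       # 0 = Z basis, 1 = X basis
her_basis = rng.integers(0, 2, n)      # Alice guesses a basis per bit

same = my_basis == her_basis
measured = np.where(same, bits, rng.integers(0, 2, n))  # wrong basis -> coin flip

key = bits[same]                       # the bits I tell her (publicly) to keep
print("key length:", key.size)                            # about n/2
print("agreement overall:", np.mean(measured == bits))    # about 0.75
print("agreement on kept bits:", np.mean(measured[same] == key))  # exactly 1.0
\end{verbatim}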
\section{Dust}
% \subsection{Extinction}
Dust absorbs and scatters radiation so along a given line of sight, the observed intensity is suppressed by the optical depth
\be
I_\lambda = I_{\lambda 0}\,e^{-\tau_\lambda}
.\eql{dtau}\ee
Roughly, $\tau_\lambda\sim \int ds n_D\sigma_\lambda$, the integral along the line of sight of the number density of dust multiplied by its cross section, which depends on wavelength.
The wavelength dependence will be very important; the basic idea is that if the wavelength of light is smaller than the size of the dust grain, then it will feel the full geometric cross-section. For a single grain with a single size, then, the cross-section will become constant as $\lambda\rightarrow 0$ and will fall off at larger wavelengths. There are actually two cross sections here, one for scattering and one for absorption. They both have this same general property but fall off differently at large wavelengths. It is useful to normalize them by the geometric cross section $\pi a^2$, where $a$ is the radius of the grain. Then, focusing on scattering,
\bea
\frac{\sigma_{\rm scatter}}{\pi a^2} &=& \begin{cases}
1 & \lambda \ll 2\pi a\cr
\left( \frac{2\pi a}{\lambda} \right)^4 & \lambda \gg 2\pi a\cr
\end{cases}
%\vs
%\frac{\sigma_{\rm abs}}{\pi a^2} &=& \begin{cases}
% 1 & \lambda \ll 2\pi a\cr
%\left( \frac{2\pi a}{\lambda} \right)^2 & \lambda \gg 2\pi a\cr
%\end{cases}.
\eql{dusts}\eea
The scattering cross-section is familiar as Rayleigh scattering: the sky is blue because blue light is scattered by the atmosphere, while red light has a much lower cross section, so it is not scattered and arrives directly from the Sun.
However, we do not observe extinction curves that look like this: i.e., when we plot $\tau_\lambda$ (or some other quantity used in its stead) as a function of wavelength, we see the optical depth continue to rise as the wavelength gets shorter and the fall-off at large wavelengths is only $\lambda^{-1}$ instead of $\lambda^{-4}$. Both of these facts speak to the presence of dust grains with a wide variety of sizes. The reason why the cross section continues to increase as the wavelength gets small is that smaller and smaller grains continue to contribute. The reason why the fall-off is much slower than $\lambda^{-4}$ is because very large grains exist and they do scatter even relatively long wavelength photons.
% There are a variety of ways to recast the optical depth. One of the most popular is to recall that magnitudes are defined as
% \be
% m \equiv -2.5\log_{10} (I)\,+ \, {\rm constant}
% .\ee
% Therefore the ratio of two intensities (or fluxes) can be written as $I_1/I_2 = 10^{-0.4(m_1-m_2)}$. Therefore, the log of \ec{dtau} is
% \be
% 0.4\left( m_{\lambda} - m_{\lambda,0} \right) = 0.434\tau_\lambda
% .\ee
% So the change is magnitude of an object due to extinction is simply
% \be
% \Delta m = 1.086 \tau_\lambda \equiv A_\lambda
% .\ee
% This is equivalent to the definition
% \be
% A_{\lambda} \equiv -2.5\,\log_{10}\left(\frac{I_{\lambda}}{I_{\lambda 0}}\right)
% .\ee
% The extinction parameter $A_\lambda$ obviously has the same dependence as $\tau_\lambda$ on the dust density, size, and composition along the line of sight.
%
% It is useful to separate out the amplitude of $A_\lambda$ (which depends mostly on how much dust there is along the line of sight) and the shape (which depends on the size and composition of the grains). One definition along these lines is
% \be
% E(B-V) \equiv A_B-A_V
% \ee
% where $B$ and $V$ are the blue ($4400\AA$) and violet ($5500\aa$) filters. Since V-band is sensitive to longer wavelength radiation, we expect its optical depth to be smaller than B-band. Therefore, $A_B$ should be larger than $A_V$, and $E(B-V)$ is positive. The bigger it is, the larger is the effect of reddening.
%
%Typical extinction curves (e.g. Fig 21.1 in \cite{2011piim.book.....D} and Fig. 7.1 in \cite{2006agna.book.....O}) show that $A_\lambda\rightarrow0$ as $\lambda\rightarrow\infty$ and increases roughly as $\lambda^{-1}$ as one moves to shorter wavelengths. Another way to describe extinction is the ratio of the extinction to the reddening:
%\be
%R_V \equiv \frac{A_V}{E(B-V)}
%.\ee
%Values of $R_V$ range from about 2 to 5 in our Galaxy, with an average of $3.1$. So if a given sight line has very little reddening but lots of extinction, it will have a large value of $R_V$. This ratio does not depend on the amount of dust along the line of sight. Regions with large $R_V$ (little reddening) must absorb about as much red as blue light. This means the large red wavelengths are still smaller than the sizes of the grains, so these regions much have very large grains. In contrast, a region (such as the SMC) with small $R_V$ much have a large difference in scattering of red and blue light, so must be dominated by small grains that do not affect the red light.
%
%There is an observed correlation between the dust and the gas, so that regions with more gas also have more dust. This manifests itself in the extinction relation:
%\be
%A_V = 0.53\,\frac{N_H}{10^{21}\,{\rm cm}^{-2}}
%\eql{avnh}\ee
%where $N_H$ is the {\it column density} of hydrogen, $\int ds n_H$, the integral along the line of sight of the hydrogen density. I've normalized to $10^{21}\,{\rm cm}^{-2}$, because the typical density might be 1 per cm$^3$, extending out hundreds of parsec.
%
%In principle, \ec{avnh} empowers us to infer something about the column density and cross section of dust. Suppose that dust particles have mass density $\rho_D= 1$ gm cm$^{-3}$ and a typical size of 0.1$\mu$m. Then the mass per length squared of dust is of order $4\pi N_D\rho_D a^3/3$ where $N_D$ is the dust column density. Meanwhile the absorbtion cross section in the V-band might be taken to be $\pi a^2 $. So we then expect
%\bea
%\tau_V &\simeq & A_V \simeq
%N_D\sigma_D
%\vs
%&=&
%N_D\pi a^2\,
%\vs
%&=&
%\frac{N_D}{3.2\times10^{9}\, cm^{-2}}
%.\eea
%This means that
%\be
%N_D \simeq 1.7\times10^{-12} N_H.
%\ee
%But the mass of a single one of these grains is much larger than a hydrogen atom: $M_D=4\pi \rho_D a^3/3=4\times 10^{-15}$ gm or $2.5\times 10^9$ larger than the proton mass. This means that the total mass in dust is about 0.2\% that in hydrogen.
%
%This simple estimate suggests that dust has a much lower density than hydrogen (by a factor of about $10^{12}$) but grains might typically be much larger, with masses a billion times larger.
%
%Fig.~23.11 in \cite{2011piim.book.....D} shows that when $\lambda$ is of order $0.55\mu$m [i.e., the V-band], the extinction cross-section is of order $5\times 10^{-22}$ cm$^2$. Therefore,
%\bea
%\tau_V &=& N_D\sigma_V
%\vs
%&\simeq& 0.5 \frac{N_H}{10^{21}\,{\rm cm}^{-2}}
%\vs
%&=&
%N_D 5\times 10^{-22} {\rm cm}^2
%\eea
%which means that the dust column density is of order
%\be
%N_D \simeq N_H
%\ee
%
%
%The reddening is also correlated with the hydrogen column density, observed to follow
%\be
%E(B-V) = 0.17\,\frac{N_H}{10^{21}\,{\rm cm}^{-2}}
%\ee
%(which of course could be inferred from the average from of $R_V$).
%
%
% \section{Detecting Dust: Quantifying Extinction}
Two historical places where dust extinction mattered:
The Hubble constant was vastly over-estimated (Fig.~\rf{hubble}) because distances were underestimated.
\Sjpg{hubble}{Hubble's initial estimate; the slope of the line is about 500 km/sec/Mpc. Note that the y-axis is mis-labeled, but more importantly this estimate is wrong by about a factor of 7.}
And then, there is the sad story of BICEP. Andrei Linde, one of the founders of inflation, was one of many physicists who were \href{https://www.youtube.com/watch?v=ZlfIVEy_YOA}{led to believe} that cosmologists had discovered gravitational waves hiding in subtle polarization patterns in the microwave radiation. It turned out to be dust from our galaxy.
Consider three methods:
\begin{itemize}
\item{\bf Pair Method} Observe two stars of the same type that therefore have the same intrinsic spectrum. One of the stars must have negligible obscuration along the line of sight. The differences between the two spectra empower us to determine the extinction along the line of sight to the second star.
\item{\bf Line Ratios}
Measurements of the fluxes from lines, especially those produced from the same upper atomic levels, can be predicted very precisely and are mostly independent of the physical conditions such as temperature and density. One example is $[S\,II]$, singly ionized sulfur. The $^2P$ state can decay into either the $^4S$ state, with a wavelength of about $0.41\mu$m, or to the $^2D$ state, with a wavelength of $1.03\mu$m. The latter is at a very long wavelength so there is little extinction, and the extinction close to the B and V bands can be inferred from the ratio of these lines. Technically, measuring a line with very long wavelength is challenging because of limitations of CCDs and atmospheric emission lines getting in the way.
Easier then is to stick with hydrogen; e.g., starting from $n=7$ and measuring the ratio of the $7\rightarrow 3$ line (Paschen) to the $7\rightarrow 2$ line (Balmer), again one at roughly a micron and the other at about $0.4\mu$m. These are better because they are stronger, but they suffer the same issue of long wavelength. Ratios of Balmer lines (e.g., $H\alpha/H\beta$ or $H\beta/H\gamma$) can be seen from Table 4.1 in \cite{2006agna.book.....O} to be relatively insensitive to the thermal properties of the region and are safely in the optical. So these give a single number that quantifies the overall level of extinction. Finally, one can use the ratio of the flux in a line such as $H\beta$ to the free-free emission in the long wavelength regime, where dust does not absorb the radiation.
\item{\bf Morphology}
If we are interested in HII regions, then there is an interesting physical effect that offers insight into the amount of dust present: light from the star scatters off dust grains and propagates to us. That light has a different morphology as a function of distance from the star than does the emission from the gas itself that we have focused on earlier.
As a simple example, consider a spherical region with a central star ionized out to its Stromgren radius $r_s$. Then the flux that we see along a given line of sight for $H\beta$ say is
\be
I_{H\beta}(d) = 2\int_0^{r_{max}} j_{H\beta}\, ds
\ee
where $d$ is the projected distance between the line of sight and the star; $s$ is the distance along the line of sight; $r_{\rm max}$ is the largest value of $s$ within the Stromgren sphere, apparently set by $r_s^2=r_{\rm max}^2+d^2$; and the factor of 2 accounts for the integral from $-r_{\rm max}$ to zero. If the medium is uniform within the Stromgren sphere in temperature, density, and ionization, then the integral is trivial, and
\be
I_{H\beta}(d) = 2 j_{H\beta} \sqrt{r_s^2-d^2}
.\ee
On the other hand, the emission from reflected starlight falls off the further one gets from the star. The emission per volume per time per solid angle is
\be
j_\nu = F_*\,\frac{n_D \sigma_\nu}{4\pi}
\ee
where $n_D$ is the number density of dust grains and $\sigma_\nu$ is the cross section for dust to scatter the radiation at that frequency. The flux from the star falls off as $1/r^2$, so the intensity observed along a line of sight will be
\bea
I_{D} &=& 2\,\frac{n_D \sigma_\nu}{4\pi}\,\int_0^{r_{\rm max}}\, ds\, \frac{L_*}{4\pi(s^2+d^2)}
\vs
&=&
\frac{n_D \sigma_\nu L_*}{8\pi^2 d}\,
\tan^{-1}\left(\frac{s}{d}\right)\Bigg\vert_0^{r_{\rm max}}\vs
&=&
\frac{n_D \sigma_\nu L_*}{8\pi^2 d}\,
\tan^{-1}\left(\frac{\sqrt{r_s^2-d^2}}{d}\right).
\eea
Expressed in terms of the angular distance from the center of the HII region, these two signals scale as
\bea
I_{H\beta} &\propto& \sqrt{\theta_s^2-\theta^2}
\vs
I_{D} &\propto& \frac{\tan^{-1}\sqrt{\theta_s^2/\theta^2 - 1}}{\theta}
.\eea
These two different morphologies are depicted in Fig.~\rf{morph} and offer an opportunity to extract information about the density and cross section of dust; a short numerical sketch of the two profiles follows this list.
\Sfig{morph}{The morphology of two signals from an HII region: emission by gas and scattered starlight by dust.}
\end{itemize}
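The two angular profiles derived above are easy to tabulate; the sketch below just evaluates the scalings with arbitrary normalizations (it does not reproduce the figure).
\begin{verbatim}
import numpy as np

# Angular profiles (arbitrary normalization) inside an HII region of angular
# radius theta_s: gas emission vs. dust-scattered starlight.
theta_s = 1.0
theta = np.linspace(0.01, 0.99, 99) * theta_s

I_gas = np.sqrt(theta_s**2 - theta**2)                          # H-beta emission
I_dust = np.arctan(np.sqrt(theta_s**2 / theta**2 - 1)) / theta  # scattered light

# The dust profile is strongly peaked toward the star; the gas profile is flat-topped.
print(I_gas[:3] / I_gas[0], I_dust[:3] / I_dust[0])
\end{verbatim}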
\section{Convolutional Neural Networks}
\bee
\item {\bf Traditional neural network:} each pair of layers is connected by a weight matrix $W_{ia}$, where $i$ ranges over all possible nodes of the output and $a$ over all nodes in the input. E.g., if the input is a 2D image, the nodes might be the value of the intensity (or even 3 values for 3 colors) in each pixel. Let's use 1D as an example. Call the input $I$ and the output $O$. Then,
\be
O_i = \sum_a W_{ia} I_a
.\ee
If there are the same number of inputs and outputs, $N$, then the number of operations needed to determine the full set of outputs is $N^2$. There are also $N^2$ elements in the weight matrix that need to be optimized (for example by minimizing a cost function that uses the final output layer).
\item {\bf Convolution:} This involves a very specific form of the weight matrix. Again assuming the number of inputs and outputs is the same:
\be
W_{ia} = \tilde W_{i-a}, \qquad {\rm with}\ \tilde W_{i-a}=0\ {\rm unless}\ |i-a|\le k.\ee
Here $k$ is typically a very small integer, so only $2k+1$ distinct weights need to be learned (see the sketch after this list).
\item {\bf Equivariance:} convolution preserves outputs under translation but not under rotation, so one needs tricks like ``learned invariances'' to handle rotations.
\eee
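A sketch contrasting the two layer types in 1D (input length and kernel size invented): the dense layer carries $N^2$ trainable numbers, while the convolution shares a single kernel of $2k+1$ weights across all positions.
\begin{verbatim}
import numpy as np

# 1D input with N values; compare a fully connected layer with a convolution.
rng = np.random.default_rng(7)
N, k = 1000, 2
I = rng.standard_normal(N)

# Dense layer: an N x N weight matrix (N^2 parameters).
W = rng.standard_normal((N, N))
O_dense = W @ I

# Convolutional layer: one shared kernel of length 2k+1 (5 parameters here).
kernel = rng.standard_normal(2 * k + 1)
O_conv = np.convolve(I, kernel, mode="same")

print("dense parameters:", W.size, " conv parameters:", kernel.size)
\end{verbatim}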
\bibliography{refs}
\appendix
\section{AI Notes}\label{sec:unblind}
This section will have some random notes about the seminars, focusing on the AI methods.
Some basic terms:
\bee
\item Cost Function
\item Features
\item Target
\item Linear Regression
\item Likelihood
\item Posterior
\item Bayesian Statistics
\item Prior
\item Classifier
\item Regularization
\item Convolution
\item Neural Network
\eee
ABCD Method: E.g., we've written a document and want to know how many typos are in it. Different people read it, and person $i$ reports $N_i$ typos. Take 2 people, and break up each person's typos into those found in common and those not:
\bea
N_1&=&A+B\vs
N_2&=&A+C
\eea
We know how many typos person 1 missed (at least $C$) and how many person 2 missed (at least $B$); we want to figure out how many both missed. If the two readers are uncorrelated, person 2's efficiency can be estimated as $A/(A+B)$, so the total number of typos person 1 missed is about $C(A+B)/A$, and the number missed by both is $(B/A)\times C$. This is also called the fake-factor method.
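A made-up numerical example of this estimate:
\begin{verbatim}
# Made-up numbers: person 1 reports N1 = 30 typos, person 2 reports N2 = 25,
# and A = 20 of them are in common; so B = 10 and C = 5 are found by only one person.
A, B, C = 20, 10, 5
missed_by_both = B * C / A              # the (B/A) x C estimate from the text
total_estimate = A + B + C + missed_by_both
print(missed_by_both, total_estimate)   # 2.5 typos missed by both; ~37.5 total
\end{verbatim}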
\section{Links to seminars}
\bei
\item \href{https://cmu.zoom.us/rec/share/2m9MSmtj6mbf7A_ueusvKnZIFItFV3HPI7QVJ7zlfSCCZGRzBt3iTHKs_fcKLJKx.3rNWZExDpw63YeTg}{Physics-constrained Computational Imaging, Laura Waller} Passcode: 0Zy@h+G6
\item \href{https://cmu.box.com/s/s16xxmvut99n1fl73yooxq7kwxe6sgfo}{Karri DiPetrillo}, LHC Searches
\item \href{https://cmu.zoom.us/rec/share/tHqLS0TsN0H399MCClI3fv2_dWaraXiiAAA5R3grrEWrA-4OZjclNI4GP_njf_TW.mPFln3zm5t0xyHP7}{Bayes Statistics}, Class Recording; Passcode: Ta\^EEV2w
\eei
\end{document}