\documentclass{article}

\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{natbib}
\usepackage{url}

%\VignetteEngine{knitr::knitr}

% knitr minimal example recommends
\usepackage[T1]{fontenc}

\newcommand{\real}{\mathbb{R}}
\DeclareMathOperator{\logit}{logit}
\newcommand{\opand}{\mathbin{\rm and}}
\newcommand{\opor}{\mathbin{\rm or}}
\newcommand{\inner}[1]{\langle #1 \rangle}
\newcommand{\set}[1]{\{\, #1 \,\}}
\DeclareMathOperator{\con}{con}
\DeclareMathOperator{\pos}{pos}
\DeclareMathOperator{\vhull}{span}

\newcommand{\REVISED}{\begin{center} \LARGE REVISED DOWN TO HERE \end{center}}
\newcommand{\MOVED}[1][equation]{\begin{center} [#1 moved] \end{center}}

\begin{document}

\section{Confidence Intervals for Mean Value Parameters}

For the complete separation example of \citet[Section~6.5.1]{agresti}, we need
confidence intervals.  The theory in \citet{geyer-gdor} says a
$100 (1 - \alpha)\%$ confidence region for the parameter is the set of all
parameter values that put at least probability $\alpha$ on the observed data
vector.  This is valid only for the ``complete separation'' case.  See
\citet{geyer-gdor} and \citet{eck-geyer} for what to do when ``complete
separation'' does not hold.

The probability in question is
$$
   \prod_{i = 1}^n p_i^{y_i} (1 - p_i)^{1 - y_i}
$$
but the computer does not like $0^0$, so we change this to
$$
   \left( \prod_{\substack{i \in N \\ y_i = 1}} p_i \right)
   \left( \prod_{\substack{i \in N \\ y_i = 0}} (1 - p_i) \right)
$$
where $N = \{ 1, \ldots, n \}$ is the index set for $y$ and $\theta$.
To avoid underflow, we take logs, giving
\begin{equation} \label{eq:constraint}
   \left( \sum_{\substack{i \in N \\ y_i = 1}} \log(p_i) \right)
   +
   \left( \sum_{\substack{i \in N \\ y_i = 0}} \log(1 - p_i) \right)
\end{equation}

We want to consider this a function of the submodel parameter $\beta$
(called ``coefficients'' by R).  The relation is
$$
   \theta = M \beta
$$
where $M$ is the model matrix, and
$$
   p = \logit^{-1}(\theta)
$$
where this inverse logit function operates componentwise,
\begin{alignat*}{2}
   p_i & = \frac{e^{\theta_i}}{1 + e^{\theta_i}}
   & & = \frac{1}{1 + e^{- \theta_i}}
   \\
   1 - p_i & = \frac{1}{1 + e^{\theta_i}}
   & & = \frac{e^{- \theta_i}}{1 + e^{- \theta_i}}
\end{alignat*}
for all $i$.  Taking logs gives
\begin{alignat*}{2}
   \log(p_i) & = \theta_i - \log(1 + e^{\theta_i})
   & & = - \log(1 + e^{- \theta_i})
   \\
   \log(1 - p_i) & = - \log(1 + e^{\theta_i})
   & & = - \theta_i - \log(1 + e^{- \theta_i})
\end{alignat*}

We want to maximize and minimize components of $p$ over the region where
\eqref{eq:constraint} is at least $\log(\alpha)$.  Actually, it may be more
computationally stable to maximize and minimize components of $\theta$ over
the same region.  Since $p$ is a componentwise monotone function of $\theta$,
both amount to the same thing.

For any quantity $Q$ we have the chain rule
$$
   \frac{\partial Q}{\partial \beta_j}
   =
   \sum_{i = 1}^n \frac{\partial Q}{\partial \theta_i}
   \frac{\partial \theta_i}{\partial \beta_j}
   =
   \sum_{i = 1}^n \frac{\partial Q}{\partial \theta_i} m_{i j}
$$
where the $m_{i j}$ are the components of the model matrix $M$.  So it
suffices to worry about derivatives with respect to the $\theta$'s.  First,
\begin{align*}
   \frac{\partial p_i}{\partial \theta_i} = p_i (1 - p_i)
\end{align*}
and $\partial p_i / \partial \theta_j = 0$ for $i \neq j$.

Let $g(\beta)$ denote the value of \eqref{eq:constraint}.
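Before differentiating, note that the two-branch formulas for $\log(p_i)$ and
$\log(1 - p_i)$ above are what make \eqref{eq:constraint} computable: for
$\theta_i$ large in absolute value the naive formula
$\log\bigl(e^{\theta_i} / (1 + e^{\theta_i})\bigr)$ loses all accuracy or
fails outright, while the branch whose exponent is negative does not.  The
following chunk is only a numerical illustration of this point (nothing below
depends on it); it uses the base R functions \texttt{log1p} and
\texttt{plogis}.
<<>>=
th <- c(-800, -30, -1, 0, 1, 30, 800)
# stable forms derived above: pick the branch whose exponent is negative
logp <- ifelse(th < 0, th - log1p(exp(th)), - log1p(exp(- th)))
logq <- ifelse(th < 0, - log1p(exp(th)), - th - log1p(exp(- th)))
# agree with R's own stable computation of log(p) and log(1 - p)
all.equal(logp, plogis(th, log.p = TRUE))
all.equal(logq, plogis(th, lower.tail = FALSE, log.p = TRUE))
# the naive formula breaks down at the extremes
log(exp(th) / (1 + exp(th)))
@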
Now differentiate $g$.  In the case $y_i = 1$ we have
$$
   \frac{\partial g(\beta)}{\partial \theta_i}
   =
   \frac{\partial \log(p_i)}{\partial \theta_i}
   =
   1 - p_i
$$
and in the case $y_i = 0$ we have
$$
   \frac{\partial g(\beta)}{\partial \theta_i}
   =
   \frac{\partial \log(1 - p_i)}{\partial \theta_i}
   =
   - p_i
$$

Make the data.
<<>>=
x <- seq(10, 90, 10)
x <- x[x != 50]
y <- as.numeric(x > 50)
@
Try to fit the data.
<<>>=
gout <- glm(y ~ x, family = binomial, x = TRUE)
summary(gout)
@
Code up these functions.
<<>>=
modmat <- gout$x
f <- function(beta, k, ...) {
    stopifnot(is.numeric(beta))
    stopifnot(is.finite(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(is.numeric(k))
    stopifnot(is.finite(k))
    stopifnot(length(k) == 1)
    stopifnot(as.integer(k) == k)
    stopifnot(k %in% 1:nrow(modmat))
    theta <- modmat %*% beta
    ifelse(y == 1, theta, - theta)[k]
}
df <- function(beta, k, ...) {
    stopifnot(is.numeric(beta))
    stopifnot(is.finite(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(is.numeric(k))
    stopifnot(is.finite(k))
    stopifnot(length(k) == 1)
    stopifnot(as.integer(k) == k)
    stopifnot(k %in% 1:nrow(modmat))
    ifelse(y == 1, 1, -1)[k] * as.vector(modmat[k, ])
}
@
OK.  We are ready to test $f$ and $d f$.  Let us make up some points at which
to test them.
<<>>=
n <- length(x)
p <- length(gout$coefficients)
beta.test <- rep(0, p)
library(numDeriv)
df(beta.test, 1)
grad(f, beta.test, k = 1)
for (i in 1:n)
    print(all.equal(df(beta.test, i), grad(f, beta.test, k = i)))
for (j in 1:5) {
    beta.test <- rnorm(p) * 0.2
    for (i in 1:n)
        print(all.equal(df(beta.test, i), grad(f, beta.test, k = i)))
}
@
Seems to be OK.  Now for $g$ and $d g$.
<<>>=
g <- function(beta, alpha, ...) {
    stopifnot(is.numeric(beta))
    stopifnot(is.finite(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(is.numeric(alpha))
    stopifnot(length(alpha) == 1)
    stopifnot(0 < alpha && alpha < 1)
    eta <- modmat %*% beta
    logp <- ifelse(eta < 0, eta - log1p(exp(eta)), - log1p(exp(- eta)))
    logq <- ifelse(eta < 0, - log1p(exp(eta)), - eta - log1p(exp(- eta)))
    logpboundary <- ifelse(y == 1, logp, logq)
    sum(logpboundary) - log(alpha)
}
dg <- function(beta, alpha, ...) {
    stopifnot(is.numeric(beta))
    stopifnot(is.finite(beta))
    stopifnot(length(beta) == ncol(modmat))
    stopifnot(is.numeric(alpha))
    stopifnot(length(alpha) == 1)
    stopifnot(0 < alpha && alpha < 1)
    theta <- modmat %*% beta
    pp <- ifelse(theta < 0, exp(theta) / (1 + exp(theta)),
        1 / (1 + exp(- theta)))
    qq <- ifelse(theta < 0, 1 / (1 + exp(theta)),
        exp(- theta) / (1 + exp(- theta)))
    # apparently R function auglag wants the jacobian of
    # the inequality constraints to be a matrix;
    # since g returns a vector of length 1,
    # this function should return a 1 by p matrix
    result <- ifelse(y == 1, qq, - pp) %*% modmat
    dimnames(result) <- NULL
    result
}
@
<<>>=
alpha.test <- 0.05
beta.test <- rep(0, p)
library(numDeriv)
dg(beta.test, alpha.test)
grad(g, beta.test, alpha = alpha.test)
all.equal(dg(beta.test, alpha.test),
    rbind(grad(g, beta.test, alpha = alpha.test)))
for (j in 1:5) {
    beta.test <- rnorm(p) * 0.2
    print(all.equal(dg(beta.test, alpha.test),
        rbind(grad(g, beta.test, alpha = alpha.test))))
}
@
Everything looks good.  Let's find confidence limits.
<<>>=
library(alabama)
beta.start <- c(0, 0)
bounds <- rep(NA_real_, length(y))
for (i in seq(along = bounds)) {
    aout <- auglag(beta.start, f, df, g, dg,
        control.outer = list(trace = FALSE), k = i, alpha = 0.05)
    if (aout$convergence == 0)
        bounds[i] <- aout$value
}
bounds
@
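As a sanity check (just a sketch using the objects defined above; nothing
below depends on it), the probability constraint should be active at each
solution, so $g$ should be approximately zero there.  We redo the first
component, this time keeping the solution vector.
<<>>=
# redo the optimization for the first component, keeping the solution
aout1 <- auglag(beta.start, f, df, g, dg,
    control.outer = list(trace = FALSE), k = 1, alpha = 0.05)
aout1$convergence == 0
# constraint function at the solution: should be nearly zero, meaning this
# parameter value puts probability (almost exactly) alpha on the observed data
g(aout1$par, alpha = 0.05)
@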
Fix up \texttt{bounds} so they correspond to what we are interested in.
<<>>=
bounds <- ifelse(y == 1, bounds, - bounds)
bounds
bounds.lower.theta <- ifelse(y == 1, bounds, -Inf)
bounds.upper.theta <- ifelse(y == 1, Inf, bounds)
data.frame(x, y, lower = bounds.lower.theta, upper = bounds.upper.theta)
bounds.lower.p <- 1 / (1 + exp(- bounds.lower.theta))
bounds.upper.p <- 1 / (1 + exp(- bounds.upper.theta))
data.frame(x, y, lower = bounds.lower.p, upper = bounds.upper.p)
@
Now make the plot.
<<fig-one-plot, eval=FALSE>>=
par(mar = c(4, 4, 0, 0) + 0.1)
plot(x, y, axes = FALSE, type = "n",
    xlab = expression(x), ylab = expression(mu(x)))
segments(x, bounds.lower.p, x, bounds.upper.p, lwd = 2)
box()
axis(side = 1)
axis(side = 2)
points(x, y, pch = 21, bg = "white")
@
Our Figure~\ref{fig:one} is Figure~2 in \citet{eck-geyer}.  It is also the
second figure in the talk \citep{talk} (the first is a scatterplot of the
data).
\begin{figure}
\begin{center}
<<fig-one, echo=FALSE>>=
<<fig-one-plot>>
@
\end{center}
\caption{One-sided 95\% confidence intervals for mean value parameters.
Bars are the intervals.  Vertical axis is the probability of observing
response value one when the predictor value is $x$.  Solid dots are the
observed data.}
\label{fig:one}
\end{figure}

\section{Support of Submodel Canonical Statistic}

The course notes \citet{infinity} show how to visualize the support of the
submodel canonical statistic $M^T y$, where $M$ is the model matrix of an
exponential family GLM and $y$ is the response vector.  We dump that material
in here to get the figure (Figure~\ref{fig:two-hyperplane} below) for the
talk; much of the discussion is copied verbatim from those notes.

We will use the theory of Barndorff-Nielsen completions of exponential
families from \citet{geyer-gdor}.

\subsection{Support Points}

That theory tells us that we must look at the set of all possible values of
the canonical statistic $M^T y$, where $M$ is the model matrix and $y$ is the
response vector.  For this model, $M$ has two columns: the first column is
all ones (the ``intercept'' column) and the second column is $x$.

So let's find that set.  There are $2^n$ possible values, where $n$ is the
dimension of the response vector, because each component of $y$ can be either
zero or one.  The following code makes all of those vectors.
<<>>=
yy <- NULL
n <- length(y)
for (i in 1:n) {
    j <- 2^(i - 1)
    k <- 2^n / j / 2
    yy <- cbind(rep(rep(0:1, each = j), times = k), yy)
}
@
<<>>=
head(yy)
dim(yy)
@
For those who know how to count in binary, row $i$ is $i - 1$ expressed in
binary.  Have you heard the joke: there are two kinds of people in this
world, those who divide everything into two kinds and those who don't?  And
its nerd version: there are 10 kinds of people in this world, those who know
binary and those who don't?  For those who don't, the following code shows
that every row of \texttt{yy} is different, every row contains only zeros and
ones, and there are $2^n$ rows.
<<>>=
fred <- apply(yy, 1, paste, collapse = "")
head(fred)
length(unique(fred)) == length(fred)
all(apply(yy, 1, function(row) all(row %in% 0:1)))
nrow(yy) == 2^n
@
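An equivalent construction, for those who would rather not count in binary,
uses \texttt{expand.grid}.  This chunk is only a cross-check (nothing below
depends on it); the rows come out in a different order, but the set of rows
is the same.
<<>>=
yy.too <- as.matrix(expand.grid(rep(list(0:1), n)))
dim(yy.too)
# same set of possible response vectors, just in a different row order
setequal(apply(yy, 1, paste, collapse = ""),
    apply(yy.too, 1, paste, collapse = ""))
@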
But there are not so many distinct values of the submodel canonical
statistic.
<<>>=
m <- cbind(1, x)
mtyy <- t(m) %*% t(yy)
t1 <- mtyy[1, ]
t2 <- mtyy[2, ]
t1.obs <- sum(y)
t2.obs <- sum(x * y)
@
\begin{figure}
\begin{center}
<<>>=
t.key <- paste(t1, t2, sep = ":")
t.idx <- match(unique(t.key), t.key)
t1.uniq <- t1[t.idx]
t2.uniq <- t2[t.idx]
par(mar = c(4, 4.5, 0, 0) + 0.1)
plot(t1.uniq, t2.uniq,
    xlab = expression(t[1] == sum(y)),
    ylab = expression(t[2] == sum(x * y)))
points(t1.obs, t2.obs, pch = 19)
@
\end{center}
\caption{Possible values of the submodel canonical statistic vector $M^T y$
for the data shown in Figure~\ref{fig:one}.  Solid dot is the observed value
of the submodel canonical statistic vector.}
\label{fig:two}
\end{figure}
\noindent
Figure~\ref{fig:two} shows these possible values of the submodel canonical
statistic.

And now we are stuck.  Figure~\ref{fig:two} seems to show that the observed
data vector is an extreme value, but we cannot easily figure out the
direction of recession.

\subsection{Tangent Vectors}

Vectors $Y(\omega) - y$, where $y$ is the observed value of the canonical
statistic vector and $Y(\omega)$ are the other possible values of the
canonical statistic vector, are called \emph{tangent vectors}
\citep[explains the reason they have this name]{geyer-gdor}.  If
$$
   T = \set{ v_i : i \in I }
$$
is the set of tangent vectors, then the set of all nonnegative combinations
of them, that is, all vectors of the form
$$
   \sum_{i \in A} a_i v_i
$$
where $A$ is a finite set and $a_i \ge 0$ for all $i$, is called the
\emph{tangent cone}.  It is denoted $\con(\pos T)$ in \citet{geyer-gdor}.

Load library \texttt{rcdd}.
<<>>=
library(rcdd)
@
Figure~\ref{fig:tangent-cone} shows the tangent vectors and tangent cone.
The points in Figures~\ref{fig:two} and~\ref{fig:tangent-cone} are the same
except that in Figure~\ref{fig:tangent-cone} they are translated so the one
corresponding to the observed value of the canonical statistic is the origin
$(0, 0)$.  The gray area is the tangent cone (the set of all nonnegative
combinations of tangent vectors).
\begin{figure}
\begin{center}
<<>>=
par(mar = c(4, 4, 0, 0) + 0.1)
plot(t1 - t1.obs, t2 - t2.obs, axes = FALSE, type = "n",
    xlab = expression(t[1] - t[1]^obs),
    ylab = expression(t[2] - t[2]^obs))
foo <- makeV(rays = cbind(t1.uniq - t1.obs, t2.uniq - t2.obs))
bar <- redundant(foo)
baz <- bar$output[ , -(1:2)]
usr <- par("usr")
max.s1.x <- max(usr[1:2] / baz[1, 1])
max.s1.y <- max(usr[3:4] / baz[1, 2])
max.s1 <- min(max.s1.x, max.s1.y)
max.s2.x <- max(usr[1:2] / baz[2, 1])
max.s2.y <- max(usr[3:4] / baz[2, 2])
max.s2 <- min(max.s2.x, max.s2.y)
xp <- c(0, baz[1, 1] * max.s1, usr[1], usr[2], usr[2], baz[2, 1] * max.s2, 0)
yp <- c(0, baz[1, 2] * max.s1, usr[3], usr[3], usr[4], baz[2, 2] * max.s2, 0)
polygon(xp, yp, border = NA, col = "gray")
box()
axis(side = 1)
axis(side = 2)
points(t1.uniq - t1.obs, t2.uniq - t2.obs)
@
\end{center}
\caption{Tangent vectors and tangent cone for the data shown in
Figure~\ref{fig:one}.  Dots are tangent vectors, gray region is the tangent
cone.}
\label{fig:tangent-cone}
\end{figure}

We are interested in the case where a finite subset of the tangent vectors
gives the same tangent cone, that is, when $S$ is a finite subset of $T$ such
that $\con(\pos S) = \con(\pos T)$.  This is obviously the case when the
statistical model has finite support, so that $T$ is finite, as in logistic
regression and log-linear models for contingency tables with multinomial or
product multinomial sampling.  As we shall see, it is also the case for
Poisson regression with log link and for log-linear models for contingency
tables with Poisson sampling.
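For this example the finite-subset phenomenon is already visible in the
objects computed for Figure~\ref{fig:tangent-cone}: the R function
\texttt{redundant} reduces the full list of tangent vectors supplied as
generators to the two extreme rays used to draw the edges of the gray region.
This chunk just looks at those objects; nothing below depends on it.
<<>>=
nrow(foo)   # tangent vectors supplied as generators of the cone
baz         # rays left after redundant(): they generate the same cone and
            # are the edges of the gray region in the figure
@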
For generalized linear models (GLM) we do not need all the tangent vectors.
For the saturated model, tangent vectors $Y(\omega) - y$ such that
$Y(\omega)$ and $y$ differ in only one coordinate are enough to generate the
whole tangent cone \citep[Section~3.11]{geyer-gdor}.  Moreover, if
$V_{\text{sat}}$ is a set of vectors generating the tangent cone for the
saturated model, then
$$
   V_{\text{sub}} = \set{ M^T v : v \in V_{\text{sat}} }
$$
is a set of vectors generating the tangent cone for the canonical affine
submodel with model matrix $M$ \citep[Section~3.10]{geyer-gdor}.  So now we
need to learn how to find these tangent vectors for the saturated model.

First consider logistic regression when $y$ has Bernoulli
(zero-or-one-valued) components.  If the observed value of $y_i$ is 0, the
only other possible value is 1, so the vector $e_i$, which has all
coordinates equal to 0 except the $i$-th, which is 1, is a tangent vector.
Similar reasoning says $- e_i$ is a tangent vector if $y_i = 1$.

Second consider logistic regression when $y$ has binomial components.  Now we
have not only the components $y_i$ of the response vector but also the sample
sizes $n_i$ that go with them.  As before, if $y_i = 0$, then $e_i$ is a
tangent vector, and if $y_i = n_i$, then $- e_i$ is a tangent vector, but now
we also have the case $0 < y_i < n_i$, in which it is possible to change the
$i$-th coordinate either up or down, so both $e_i$ and $- e_i$ are tangent
vectors.

Third consider Poisson regression with log link.  Just as in the binomial
case, $e_i$ is a tangent vector when $y_i = 0$, and both $e_i$ and $- e_i$
are tangent vectors when $0 < y_i < \infty$.  Since there is no upper bound
to the range of a Poisson random variable, there is no case where only
$- e_i$ is a tangent vector.
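The three cases just described are easy to put into code.  The following
helper is only a sketch (it is not used in the rest of this document): it
returns the saturated-model tangent vectors, one per row, for binomial data,
with Bernoulli data being the special case where every sample size is one and
Poisson data the case where every sample size is infinite.
<<>>=
sat.tanv <- function(y, size = rep(1, length(y))) {
    stopifnot(length(y) == length(size))
    stopifnot(all(0 <= y & y <= size))
    rays <- NULL
    for (i in seq(along = y)) {
        e <- rep(0, length(y))
        e[i] <- 1
        if (y[i] < size[i]) rays <- rbind(rays, e, deparse.level = 0)
        if (y[i] > 0) rays <- rbind(rays, - e, deparse.level = 0)
    }
    rays
}
# Bernoulli data of our example: one tangent vector per component,
# e_i when y_i = 0 and - e_i when y_i = 1
sat.tanv(y)
# Poisson data would use size = Inf for every component
@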
\subsection{Calculating the Linearity} \label{sec:linearity}

We now want to calculate a GDOR, but that calculation proceeds in two steps
in the algorithm of \citet{geyer-gdor}.  First we need to find the
\emph{linearity} of the tangent cone \citep[Section~3.12]{geyer-gdor}, which
is the smallest vector subspace contained in the tangent cone, although
\citet{geyer-gdor} also (somewhat sloppily) uses the same term for a set of
vectors spanning this vector subspace.  If $V_{\text{sub}}$ is a set of
vectors generating the tangent cone for the canonical affine submodel, then
there is an R function \texttt{linearity} in the R package \texttt{rcdd} that
calculates
\begin{equation} \label{eq:linearity}
   L_{\text{sub}}
   =
   \set{ v \in V_{\text{sub}} : - v \in \con(\pos V_{\text{sub}}) }.
\end{equation}
This is the set of all the given tangent vectors that are in the linearity of
the tangent cone.  They also span it, hence determine it.  If we use
$L_{\text{sub}}$, as defined in \eqref{eq:linearity}, to denote a set of
tangent vectors, then the linearity considered as a vector space is denoted
$\vhull L_{\text{sub}}$.

The linearity is useful for three reasons.  The hyperplane $H$ that supports
the limiting conditional model (LCM), which is defined in Theorem~6 in
\citet{geyer-gdor} and the discussion following it, can be expressed as
$H = y + \vhull L_{\text{sub}}$.  So the linearity tells us the support of
the LCM.  We also need to know what the linearity is in order to calculate a
GDOR.  Finally, the linearity tells us whether the MLE exists or not: it
exists if and only if $L_{\text{sub}} = V_{\text{sub}}$
\citep[Theorem~4]{geyer-gdor}.

So let us calculate the linearity for our example, the data shown in
Figure~\ref{fig:one}.  We follow Section~4.1 of \citet{tr672}.
<<>>=
tanv <- m
tanv[y == 1, ] <- (- tanv[y == 1, ])
vrep <- makeV(rays = tanv)
lout <- linearity(d2q(vrep), rep = "V")
lout
@
$M^T e_i$ is just the $i$-th row of $M$, so the rows of \texttt{m} are either
tangent vectors or $- 1$ times tangent vectors.  So we assign \texttt{tanv}
to be \texttt{m} and then adjust the signs: for rows of \texttt{m} such that
the corresponding component of \texttt{y} is equal to one, we need to change
the sign.  Thus the second and third lines of the code chunk above make
\texttt{tanv} a matrix whose rows are the elements of $V_{\text{sub}}$.
The next two lines are idiomatic usage of the R package \texttt{rcdd}.

The result \texttt{lout} is an integer vector giving the indices of the
tangent vectors in the linearity, that is, \verb@tanv[lout, ]@ is a basis for
the linearity considered as a vector subspace.  Here the result is a vector
of length zero, which says the empty set of vectors spans the linearity, so
the linearity is the trivial vector subspace $\{0\}$ that has only one point.
We could actually see this in Figure~\ref{fig:tangent-cone}: the gray area is
a pointed cone, so it contains only the trivial subspace.

So this tells us that the support of the LCM for this example contains only
one point.  The MLE distribution is completely degenerate, concentrated at
$y$.  The MLE distribution says the only data we could have observed is what
we did observe; no other data values were possible.  Before anyone decides
this is weird, let me remind you this is only an estimate, and, as always,
estimates are not parameters.  This degeneracy causes no problem so long as
we don't overinterpret it.

This complete degeneracy of the MLE distribution is what Agresti calls
``complete separation.''

\subsection{Calculating Generic Directions of Recession} \label{sec:gdor}

If $L_{\text{sub}} \neq V_{\text{sub}}$, the MLE does not exist in the
original model (OM), and we need to calculate a GDOR.  In this we follow
\citet[Section~3.13]{geyer-gdor}.  A vector $\eta$ in the parameter space is
a GDOR if and only if
\begin{subequations}
\begin{align}
   \inner{v, \eta} = 0, & \qquad v \in L_{\text{sub}}
   \label{eq:linear}
   \\
   \inner{v, \eta} < 0, & \qquad v \in V_{\text{sub}} \setminus L_{\text{sub}}
   \label{eq:nonlinear}
\end{align}
\end{subequations}
and we can find one such $\eta$ by solving the following linear program
\begin{equation} \label{eq:linear-program}
\begin{split}
   & \text{maximize}
   \\
   & \qquad \varepsilon
   \\
   & \text{subject to}
   \\
   & \qquad
   \begin{aligned}
   \varepsilon & \le 1
   \\
   \inner{v, \eta} & = 0, & & \qquad v \in L_{\text{sub}}
   \\
   \inner{v, \eta} & \le - \varepsilon, & & \qquad v \in V_{\text{sub}} \setminus L_{\text{sub}}
   \end{aligned}
\end{split}
\end{equation}
where $\eta$ is a vector, $\varepsilon$ is a scalar, and
$(\eta, \varepsilon)$ denotes a vector of length one more than the length of
$\eta$.  This vector is the vector of variables of the linear program.  The
$\eta$ part of the solution is a generic direction of recession.  The
$\varepsilon$ part does not matter.

So we solve this linear program to calculate the GDOR, still following
Section~4.1 of \citet{tr672}.
<<>>=
p <- ncol(tanv)
hrep <- cbind(0, 0, -tanv, -1)
hrep <- rbind(hrep, c(0, 1, rep(0, p), -1))
objv <- c(rep(0, p), 1)
pout <- lpcdd(d2q(hrep), d2q(objv), minimize = FALSE)
names(pout)
pout$solution.type
gdor <- q2d(pout$primal.solution[1:p])
gdor
@
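As a quick check (a sketch; nothing below depends on it), a GDOR must satisfy
\eqref{eq:nonlinear}, and since the linearity is empty here, its inner
product with every tangent vector must be strictly negative.
<<>>=
# inner products of the GDOR with the tangent vectors (rows of tanv);
# the linear program forces these to be at most - epsilon, hence negative
as.vector(tanv %*% gdor)
all(tanv %*% gdor < 0)
@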
The code chunk above that solves the linear program is not general.  It
assumes the linearity is trivial, as in the particular example we are working
on.  More on this later.

So now we have a GDOR, and we should put it on the plot, but we cannot.  The
reason is that $\eta$ is a vector in the parameter space (as we have been
saying over and over), but the space plotted in Figure~\ref{fig:two} is the
sample space for the canonical statistic vector.  (I tried.  There is no way
to draw $\eta$ into Figure~\ref{fig:two}.)  What we can do is add to the plot
the hyperplane
\begin{equation} \label{eq:hyperplane-lcm}
   H = \set{ x \in \real^2 : \inner{x, \eta} = \inner{y, \eta} }
\end{equation}
See Figure~\ref{fig:two-hyperplane}.
\begin{figure}
\begin{center}
<<>>=
par(mar = c(4, 4.25, 0, 0) + 0.1)
plot(t1.uniq, t2.uniq,
    xlab = expression(t[1] == sum(y)),
    ylab = expression(t[2] == sum(x * y)))
points(t1.obs, t2.obs, pch = 19)
slop <- (- gdor[1] / gdor[2])
intr <- t2.obs - t1.obs * slop
abline(intr, slop)
@
\end{center}
\caption{Possible values of the submodel canonical statistic vector $M^T y$
for the data shown in Figure~\ref{fig:one}.  Solid dot is the observed value
of the submodel canonical statistic vector.  Solid line is the hyperplane
\eqref{eq:hyperplane-lcm} on which the LCM is concentrated.}
\label{fig:two-hyperplane}
\end{figure}

The fact that the only possible value of the canonical statistic vector on
$H$ is the observed value $M^T y$ again tells us that the LCM is completely
degenerate, concentrated at the observed value.

\begin{thebibliography}{}

\bibitem[Agresti(2013)]{agresti}
Agresti, A. (2013).
\newblock \emph{Categorical Data Analysis}, third edition.
\newblock John Wiley \& Sons, Hoboken, NJ.

\bibitem[Eck and Geyer(submitted)]{eck-geyer}
Eck, D.~J., and Geyer, C.~J.
\newblock Computationally efficient likelihood inference in exponential
    families when the maximum likelihood estimator does not exist.
\newblock Submitted to \emph{Annals of Statistics}.
\newblock \url{https://arxiv.org/abs/1803.11240}

\bibitem[Geyer(2008)]{tr672}
Geyer, C.~J. (2008).
\newblock Supporting theory and data analysis for ``Likelihood inference in
    exponential families and directions of recession''.
\newblock Technical Report 672, School of Statistics, University of
    Minnesota.
\newblock \url{http://www.stat.umn.edu/geyer/gdor/phaseTR.pdf}.

\bibitem[Geyer(2009)]{geyer-gdor}
Geyer, C.~J. (2009).
\newblock Likelihood inference in exponential families and directions of
    recession.
\newblock \emph{Electronic Journal of Statistics}, \textbf{3}, 259--289.

\bibitem[Geyer(2016)]{infinity}
Geyer, C.~J. (2016).
\newblock Two Examples of Agresti.
\newblock Class notes, PDF
    \url{http://www.stat.umn.edu/geyer/8931expfam/infinity.pdf},
    knitr source \url{http://www.stat.umn.edu/geyer/8931expfam/infinity.Rnw}.

\bibitem[Geyer(2018)]{talk}
Geyer, C.~J. (2018).
\newblock Fast Valid Statistical Inference when the Maximum Likelihood
    Estimate Does Not Exist in an Exponential Family Model and the ``Usual
    Asymptotics'' are Bogus.
\newblock Talk at Mini-Conference to Celebrate Elizabeth Thompson's
    Contributions to Statistics, Genetics and the University of Washington,
    June 19, 2018.
\newblock \url{http://users.stat.umn.edu/~geyer/ElizabethFest/}

\end{thebibliography}

\end{document}