diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 8e34fc0dcd7..3ccd35d9b1b 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -13,8 +13,8 @@ A clear and concise description of what the problem is. Ex. I'm always frustrate #### Describe the solution you'd like A clear and concise description of what you want to happen. -#### Describe alternatives you've considered -A clear and concise description of any alternative solutions or features you've considered. +#### Describe alternatives you have considered +A clear and concise description of any alternative solutions or features you have considered. #### Additional context Add any other context about the feature request here. \ No newline at end of file diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index eae5469bc6d..8c8101c1711 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -35,7 +35,7 @@ For a pull request to be accepted, you must meet the below requirements. This gr Linting ~~~~~~~ -Due to the way we have the CI builds set up, the linter won't do anything unless the environmental variable $LINT is set to a truthy value. +Due to the way we have the CI builds set up, the linter will not do anything unless the environment variable $LINT is set to a truthy value. - On MacOS/Linux @@ -46,7 +46,7 @@ Due to the way we have the CI builds set up, the linter won't do anything unless How to Submit a Pull Request ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -So you want to submit a patch to `statsmodels` but aren't too familiar with github? Here are the steps you need to take. +So you want to submit a patch to `statsmodels` but are not too familiar with github? Here are the steps you need to take. 1. `Fork `_ the `statsmodels repository `_ on Github. 2. `Create a new feature branch `_. Each branch must be self-contained, with a single new feature or bugfix. 
diff --git a/COPYRIGHTS.txt b/COPYRIGHTS.txt index ac4447ab125..3a505ea3db3 100644 --- a/COPYRIGHTS.txt +++ b/COPYRIGHTS.txt @@ -5,7 +5,7 @@ statsmodels contains code or derivative code from several other packages. Some modules also note the author of individual contributions, or author of code that formed the basis for the derived or translated code. The copyright statements for the datasets are attached to the individual -datasets, most datasets are in public domain, and we don't claim any copyright +datasets, most datasets are in public domain, and we do not claim any copyright on any of them. In the following, we collect copyright statements of code from other packages, diff --git a/docs/make.bat b/docs/make.bat index db86deb117e..9ce46a1c4fe 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -28,7 +28,7 @@ if errorlevel 9009 ( echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. - echo.If you don't have Sphinx installed, grab it from + echo.If you do not have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) diff --git a/docs/source/_static/mktree.js b/docs/source/_static/mktree.js index 6322c5eeca9..738cbbc7e1e 100644 --- a/docs/source/_static/mktree.js +++ b/docs/source/_static/mktree.js @@ -3,7 +3,7 @@ * * Dual licensed under the MIT and GPL licenses. * This basically means you can use this code however you want for - * free, but don't claim to have written it yourself! + * free, but do not claim to have written it yourself! 
* Donations always accepted: https://www.JavascriptToolbox.com/donate/ * * Please do not link to the .js files on javascripttoolbox.com from @@ -103,7 +103,7 @@ function convertTrees() { setDefault("nodeLinkClass","bullet"); setDefault("preProcessTrees",true); if (preProcessTrees) { - if (!document.createElement) { return; } // Without createElement, we can't do anything + if (!document.createElement) { return; } // Without createElement, we cannot do anything var uls = document.getElementsByTagName("ul"); if (uls==null) { return; } var uls_length = uls.length; diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst index 64c1b11e9f8..61af3c80cd0 100644 --- a/docs/source/_templates/autosummary/class.rst +++ b/docs/source/_templates/autosummary/class.rst @@ -2,7 +2,7 @@ {% block methods %} {% if methods %} - .. HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. HACK -- the point here is that we do not want this to appear in the output, but the autosummary should still generate the pages. .. autosummary:: :toctree: {% for item in all_methods %} @@ -15,7 +15,7 @@ {% block attributes %} {% if attributes %} - .. HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. HACK -- the point here is that we do not want this to appear in the output, but the autosummary should still generate the pages. .. 
autosummary:: :toctree: {% for item in all_attributes %} diff --git a/docs/source/conf.py b/docs/source/conf.py index 8261a2a0d3f..08f53e1c1ad 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -148,7 +148,7 @@ # html_theme = 'default' if 'htmlhelp' in sys.argv: - # html_theme = 'statsmodels_htmlhelp' #doesn't look nice yet + # html_theme = 'statsmodels_htmlhelp' #does not look nice yet html_theme = 'default' print('################# using statsmodels_htmlhelp ############') else: diff --git a/docs/source/contrasts.rst b/docs/source/contrasts.rst index 910e328aaa1..c349f5460cf 100644 --- a/docs/source/contrasts.rst +++ b/docs/source/contrasts.rst @@ -7,7 +7,7 @@ Patsy: Contrast Coding Systems for categorical variables A categorical variable of K categories, or levels, usually enters a regression as a sequence of K-1 dummy variables. This amounts to a linear hypothesis on the level means. That is, each test statistic for these variables amounts to testing whether the mean for that level is statistically significantly different from the mean of the base category. This dummy coding is called Treatment coding in R parlance, and we will follow this convention. There are, however, different coding methods that amount to different sets of linear hypotheses. -In fact, the dummy coding is not technically a contrast coding. This is because the dummy variables add to one and are not functionally independent of the model's intercept. On the other hand, a set of *contrasts* for a categorical variable with `k` levels is a set of `k-1` functionally independent linear combinations of the factor level means that are also independent of the sum of the dummy variables. The dummy coding isn't wrong *per se*. It captures all of the coefficients, but it complicates matters when the model assumes independence of the coefficients such as in ANOVA. 
Linear regression models do not assume independence of the coefficients and thus dummy coding is often the only coding that is taught in this context. +In fact, the dummy coding is not technically a contrast coding. This is because the dummy variables add to one and are not functionally independent of the model's intercept. On the other hand, a set of *contrasts* for a categorical variable with `k` levels is a set of `k-1` functionally independent linear combinations of the factor level means that are also independent of the sum of the dummy variables. The dummy coding is not wrong *per se*. It captures all of the coefficients, but it complicates matters when the model assumes independence of the coefficients such as in ANOVA. Linear regression models do not assume independence of the coefficients and thus dummy coding is often the only coding that is taught in this context. To have a look at the contrast matrices in Patsy, we will use data from UCLA ATS. First let's load the data. @@ -72,7 +72,7 @@ Here we used `reference=0`, which implies that the first level, Hispanic, is the contrast.matrix[hsb2.race-1, :][:20] -This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. If it does not, this conversion happens under the hood, so this won't work in general but nonetheless is a useful exercise to fix ideas. The below illustrates the output using the three contrasts above +This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. If it does not, this conversion happens under the hood, so this will not work in general but nonetheless is a useful exercise to fix ideas. The below illustrates the output using the three contrasts above .. ipython:: python @@ -113,7 +113,7 @@ Sum coding compares the mean of the dependent variable for a given level to the res = mod.fit() print(res.summary()) -This correspons to a parameterization that forces all the coefficients to sum to zero. 
Notice that the intercept here is the grand mean where the grand mean is the mean of means of the dependent variable by each level. +This corresponds to a parameterization that forces all the coefficients to sum to zero. Notice that the intercept here is the grand mean where the grand mean is the mean of means of the dependent variable by each level. .. ipython:: python diff --git a/docs/source/datasets/dataset_proposal.rst b/docs/source/datasets/dataset_proposal.rst index b1ad4c8b917..5bc8fbb9926 100644 --- a/docs/source/datasets/dataset_proposal.rst +++ b/docs/source/datasets/dataset_proposal.rst @@ -128,7 +128,7 @@ Remaining problems: we want to avoid loading all the data in memory? Can we use memory mapped arrays ? - Missing data: I thought about subclassing both record arrays and - masked arrays classes, but I don't know if this is feasable, or even + masked arrays classes, but I do not know if this is feasible, or even makes sense. I have the feeling that some Data mining software use Nan (for example, weka seems to use float internally), but this prevents them from representing integer data. diff --git a/docs/source/dev/git_notes.rst b/docs/source/dev/git_notes.rst index 5062c4a7821..4bd341b330b 100644 --- a/docs/source/dev/git_notes.rst +++ b/docs/source/dev/git_notes.rst @@ -157,7 +157,7 @@ change history by:: git log --oneline --graph It pays to take care of things locally before you push them to github. So when -in doubt, don't push. Also see the advice on keeping your history clean in +in doubt, do not push. Also see the advice on keeping your history clean in :ref:`merge-vs-rebase`. .. _pull-requests: @@ -193,7 +193,7 @@ One last thing to note. If there has been a lot of work in upstream/master since you started your patch, you might want to rebase. However, you can probably get away with not rebasing if these changes are unrelated to the work you have done in the `shiny-new-feature` branch. If you can avoid it, then -don't rebase. 
If you have to, try to do it once and when you are at the end of +do not rebase. If you have to, try to do it once and when you are at the end of your changes. Read on for some notes on :ref:`merge-vs-rebase`. Advanced Topics @@ -221,7 +221,7 @@ the warnings Namely, **always make a new branch before doing a rebase**. This is good general advice for working with git. I would also add **never use rebase on work that has already been published**. If another developer is using your -work, don't rebase!! +work, do not rebase!! As for merging, **never merge from trunk into your feature branch**. You will, however, want to check that your work will merge cleanly into trunk. This will @@ -253,7 +253,7 @@ however. To delete the branch on github, do:: .. Squashing with Rebase .. ^^^^^^^^^^^^^^^^^^^^^ -.. You've made a bunch of incremental commits, but you think they might be better off together as one +.. You have made a bunch of incremental commits, but you think they might be better off together as one .. commit. You can do this with an interactive rebase. As usual, **only do this when you have local .. commits. Do not edit the history of changes that have been pushed.** diff --git a/docs/source/dev/index.rst b/docs/source/dev/index.rst index 118ca836e1c..2de8dbcbb37 100644 --- a/docs/source/dev/index.rst +++ b/docs/source/dev/index.rst @@ -60,7 +60,7 @@ greatly helps the job of maintaining and releasing the software a shared effort. How to Submit a Pull Request ---------------------------- -So you want to submit a patch to `statsmodels` but aren't too familiar with +So you want to submit a patch to `statsmodels` but are not too familiar with github? Here are the steps you need to take. 1. 
`Fork `_ the diff --git a/docs/source/dev/maintainer_notes.rst b/docs/source/dev/maintainer_notes.rst index 197e59fe4fe..40e99d0777b 100644 --- a/docs/source/dev/maintainer_notes.rst +++ b/docs/source/dev/maintainer_notes.rst @@ -34,7 +34,7 @@ If there are only a few commits, you can rebase to keep a linear history:: git rebase upstream-rw/master Rebasing will not automatically close the pull request however, if there is one, -so don't forget to do this. +so do not forget to do this. .. _merging: diff --git a/docs/source/dev/naming_conventions.rst b/docs/source/dev/naming_conventions.rst index a63173e956f..2ab52eaaab1 100644 --- a/docs/source/dev/naming_conventions.rst +++ b/docs/source/dev/naming_conventions.rst @@ -41,7 +41,7 @@ Our directory tree stripped down looks something like:: The submodules are arranged by topic, `discrete` for discrete choice models, or `tsa` for time series analysis. The submodules that can be import heavy contain an empty __init__.py, except for some testing code for running tests for the submodules. The namespace to be imported is in `api.py`. That way, we -can import selectively and do not have to import a lot of code that we don't need. Helper functions are +can import selectively and do not have to import a lot of code that we do not need. Helper functions are usually put in files named `tools.py` and statistical functions, such as statistical tests are placed in `stattools.py`. Everything has directories for :ref:`tests `. @@ -83,7 +83,7 @@ time-series ARMA model we have:: Options ~~~~~~~ We are using similar options in many classes, methods and functions. They -should follow a standardized pattern if they recurr frequently. :: +should follow a standardized pattern if they recur frequently. 
:: `missing` ['none', 'drop', 'raise'] define whether inputs are checked for nans, and how they are treated diff --git a/docs/source/diagnostic.rst b/docs/source/diagnostic.rst index 9d9f104bfa0..20d5d46c120 100644 --- a/docs/source/diagnostic.rst +++ b/docs/source/diagnostic.rst @@ -113,7 +113,7 @@ Unknown Change Point :py:func:`recursive_olsresiduals ` Calculate recursive ols with residuals and cusum test statistic. This is currently mainly helper function for recursive residual based tests. - However, since it uses recursive updating and doesn't estimate separate + However, since it uses recursive updating and does not estimate separate problems it should be also quite efficient as expanding OLS function. missing @@ -122,7 +122,7 @@ missing - test on recursive parameter estimates, which are there? -Mutlicollinearity Tests +Multicollinearity Tests -------------------------------- conditionnum (statsmodels.stattools) diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 6226f21a0d8..34c68707067 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -39,7 +39,7 @@ takes this keyword. You can find more information in the docstring of .. _build-faq: -Why won't statsmodels build? +Why will statsmodels not build? ----------------------------- +------------------------------- Remember that to build, you must have: @@ -75,7 +75,7 @@ get involved. We accept Pull Requests on our GitHub page for bugfixes and topics germane to statistics and statistical modeling. In addition, usability and quality of life enhancements are greatly appreciated as well. -What if my question isn't answered here? +What if my question is not answered here? 
----------------------------------------- +----------------------------------------- You may find answers for questions that have not yet been added here on GitHub diff --git a/docs/source/names_wordlist.txt b/docs/source/names_wordlist.txt index ddadd3f073c..7ac886f913f 100644 --- a/docs/source/names_wordlist.txt +++ b/docs/source/names_wordlist.txt @@ -89,3 +89,11 @@ Longley Koenker gliptak Spector +Wes +statawriter +Nonparameteric +prerotated +uniq +exceedance +separatevar + diff --git a/docs/source/nonparametric.rst b/docs/source/nonparametric.rst index d9a47f55304..64d82887798 100644 --- a/docs/source/nonparametric.rst +++ b/docs/source/nonparametric.rst @@ -11,7 +11,7 @@ includes kernel density estimation for univariate and multivariate data, kernel regression and locally weighted scatterplot smoothing (lowess). sandbox.nonparametric contains additional functions that are work in progress -or don't have unit tests yet. We are planning to include here nonparametric +or do not have unit tests yet. We are planning to include here nonparametric density estimators, especially based on kernel or orthogonal polynomials, smoothers, and tools for nonparametric models and methods in other parts of statsmodels. diff --git a/docs/source/plots/graphics_gofplots_qqplot_qqline.py b/docs/source/plots/graphics_gofplots_qqplot_qqline.py index a1126cb56d2..7a1333cd17d 100644 --- a/docs/source/plots/graphics_gofplots_qqplot_qqline.py +++ b/docs/source/plots/graphics_gofplots_qqplot_qqline.py @@ -1,5 +1,5 @@ ''' - Import the food expenditure dataset. Plot annual food expendeture on + Import the food expenditure dataset. Plot annual food expenditure on x-axis and household income on y-axis. Use qqline to add regression line into the plot. 
''' diff --git a/docs/source/plots/graphics_mosaicplot_mosaic.py b/docs/source/plots/graphics_mosaicplot_mosaic.py index acb838c5a54..9bc08c40f25 100644 --- a/docs/source/plots/graphics_mosaicplot_mosaic.py +++ b/docs/source/plots/graphics_mosaicplot_mosaic.py @@ -25,7 +25,7 @@ mosaic(data, title='hierarchical index series') plt.show() -# The third accepted data structureis the np array, for which a very simple +# The third accepted data structure is the np array, for which a very simple # index will be created. rand = np.random.random data = 1+rand((2, 2)) diff --git a/docs/source/plots/graphics_plot_fit_ex.py b/docs/source/plots/graphics_plot_fit_ex.py index 347d689a887..9296ef7685e 100644 --- a/docs/source/plots/graphics_plot_fit_ex.py +++ b/docs/source/plots/graphics_plot_fit_ex.py @@ -8,7 +8,7 @@ """ # Load the Statewide Crime data set and perform linear regression with -# 'poverty' and 'hs_grad' as variables and 'muder' as the response +# 'poverty' and 'hs_grad' as variables and 'murder' as the response import statsmodels.api as sm diff --git a/docs/source/plots/graphics_regression_regress_exog.py b/docs/source/plots/graphics_regression_regress_exog.py index a7a2ae37265..fd1c06ce067 100644 --- a/docs/source/plots/graphics_regression_regress_exog.py +++ b/docs/source/plots/graphics_regression_regress_exog.py @@ -3,7 +3,7 @@ Load the Statewide Crime data set and build a model with regressors including the rate of high school graduation (hs_grad), population in urban areas (urban), households below poverty line (poverty), and single person -households (single). Outcome variable is the muder rate (murder). +households (single). Outcome variable is the murder rate (murder). 
Build a 2 by 2 figure based on poverty showing fitted versus actual murder rate, residuals versus the poverty rate, partial regression plot of poverty, diff --git a/docs/source/release/old_changes.rst b/docs/source/release/old_changes.rst index 1290bee5d64..1f5c2dc1a81 100644 --- a/docs/source/release/old_changes.rst +++ b/docs/source/release/old_changes.rst @@ -59,7 +59,7 @@ This is a bug-fix release, that affects mainly Big-Endian machines. *Bug Fixes* * discrete_model.MNLogit fix summary method -* tsa.filters.hp_filter don't use umfpack on Big-Endian machine (scipy bug) +* tsa.filters.hp_filter do not use umfpack on Big-Endian machine (scipy bug) * the remaining fixes are in the test suite, either precision problems on some machines or incorrect testing on Big-Endian machines. diff --git a/docs/source/release/version0.10.rst b/docs/source/release/version0.10.rst index cb793d2997a..dff2fb49db7 100644 --- a/docs/source/release/version0.10.rst +++ b/docs/source/release/version0.10.rst @@ -472,7 +472,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`4853`: CLN/REF: Remove recarrays from datasets * :pr:`4855`: BUG: Attach vc_names for mixed Poisson models * :pr:`4858`: MAINT: Delete migrate_issues_gh -* :pr:`4859`: Fix some NameErrors, don't delete unused [...] +* :pr:`4859`: Fix some NameErrors, do not delete unused [...] 
* :pr:`4861`: DOC: Fix small doc errors * :pr:`4864`: CLN: fix and lint for W391 blank line at end of file * :pr:`4869`: Update setup.cfg @@ -525,7 +525,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`4979`: [MAINT/TST] remove np.testing.dec unused imports (nose dependency) * :pr:`4980`: [MAINT/TST] skip/xfail tests instead of mangling/commenting-out in genmod, regression * :pr:`4981`: [MAINT] Remove info.py -* :pr:`4982`: DOC Fix typo Paramters-->Parameters +* :pr:`4982`: DOC Fix typo Parameters-->Parameters * :pr:`4983`: [TST] xfail/skip instead of commenting-out/mangling discrete tests * :pr:`4984`: [TST/DOC] make commented-out code in tests/results into readable docs * :pr:`4985`: [TST/DOC] Make test comments more readable @@ -545,7 +545,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5007`: MAINT: unused imports in robust * :pr:`5011`: [MAINT] remove text file relics from scikits/statsmodels * :pr:`5012`: [MAINT/TST] move misplaced results files in regressions/tests -* :pr:`5013`: [MAINT] fix typo depricated-->deprecated +* :pr:`5013`: [MAINT] fix typo deprecated-->deprecated * :pr:`5014`: [MAINT] typo in __init__ signature * :pr:`5015`: [MAINT] move misplaced test_tsa_indexes * :pr:`5016`: ENH: Burgs algorithm @@ -559,7 +559,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5043`: BUG: Initialization when simulating * :pr:`5045`: MAINT: strict linting for tsa.statespace.tests.results * :pr:`5057`: BUG: Correct check for callable -* :pr:`5058`: BUG: Don't use mutable default values +* :pr:`5058`: BUG: Do not use mutable default values * :pr:`5059`: BLD: Add line displaying CPU info to CI * :pr:`5065`: TST: Fix incorrect assertion * :pr:`5070`: MAINT: remove file that just says to remove it @@ -572,7 +572,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5083`: ENH: Add "(uncentered)" after rsquared label in .summary, .summary2 when appropriate * 
:pr:`5086`: TST: parametrize tests instead of using for loops * :pr:`5088`: DOC: Add javascript to link to other doc versions -* :pr:`5090`: MAINT: Chrome doesn't like having a secure link with an unsecure image +* :pr:`5090`: MAINT: Chrome does not like having a secure link with an unsecure image * :pr:`5093`: Allow results to be stored for multiple imputation * :pr:`5096`: ENH remove unneeded restriction on QIC (GEE) * :pr:`5099`: MAINT: fix and lint for W292 newline at end of file @@ -763,7 +763,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5640`: ENH: Improve error when ARMA endog is not 1d * :pr:`5641`: Josef pkt svar irf errband 5280 * :pr:`5642`: TST: Relax tolerance on OSX for OpenBlas issues -* :pr:`5643`: MAINT: Consoliate platform checks +* :pr:`5643`: MAINT: Consolidate platform checks * :pr:`5644`: CLN/DOC: Remove unused module, vbench references * :pr:`5645`: TST: Allow network failure in web tests * :pr:`5646`: BUG: Fix MANOVA when not using formulas @@ -806,7 +806,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5695`: MAINT: remove NameError-having version of levinson_durbin, just keep … * :pr:`5696`: CLN: remove identical functions from garch * :pr:`5697`: CLN: strict linting for examples/ -* :pr:`5698`: PERF: Avoid impplicit check when hasconst +* :pr:`5698`: PERF: Avoid implicit check when hasconst * :pr:`5699`: BUG: Limit lag length in adf * :pr:`5700`: MAINT: Update import of URLError * :pr:`5701`: MAINT: missing imports, typos, fixes several NameErrors @@ -838,7 +838,7 @@ Thie following Pull Requests were merged since the last release: * :pr:`5740`: CLN: strict linting for tsa.tests.results * :pr:`5742`: CLN: strict linting for remaining results directories * :pr:`5743`: CLN: strict linting for results files in sandbox/regression/tests/ -* :pr:`5744`: CLN: Fix/lint for dangerious redefinitions and comparisons +* :pr:`5744`: CLN: Fix/lint for dangerous redefinitions and comparisons * 
:pr:`5746`: MAINT: fix missing or redundant imports * :pr:`5748`: CLN: clean up adfvalues, avoid using `eval` * :pr:`5750`: CLN: E131 hanging indentation alignment diff --git a/docs/source/release/version0.5.rst b/docs/source/release/version0.5.rst index 140e86072f2..2f52266b102 100644 --- a/docs/source/release/version0.5.rst +++ b/docs/source/release/version0.5.rst @@ -91,7 +91,7 @@ New and Improved Graphics * **Mosaic Plot**: Create a mosaic plot from a contingency table. This allows you to visualize multivariate categorical data in a rigorous and informative way. Available with :func:`sm.graphics.mosaic `. -* **Interaction Plot**: Interaction plots now handle categorical factors as well as other improviments. :func:`sm.graphics.interaction_plot `. +* **Interaction Plot**: Interaction plots now handle categorical factors, among other improvements. :func:`sm.graphics.interaction_plot `. * **Regression Plots**: The regression plots have been refactored and improved. They can now handle pandas objects and regression results instances appropriately. See :func:`sm.graphics.plot_fit `, :func:`sm.graphics.plot_regress_exog `, :func:`sm.graphics.plot_partregress `, :func:`sm.graphics.plot_ccpr `, :func:`sm.graphics.abline_plot `, :func:`sm.graphics.influence_plot `, and :func:`sm.graphics.plot_leverage_resid2 `. @@ -182,7 +182,7 @@ Development summary and credits The previous version (statsmodels 0.4.3) was released on July 2, 2012. Since then we have closed a total of 380 issues, 172 pull requests and 208 regular issues. The :ref:`detailed list` can be viewed. -This release is a result of the work of the following 38 authors who contributed total of 2032 commits. If for any reason, we've failed to list your name in the below, please contact us: +This release is a result of the work of the following 38 authors who contributed a total of 2032 commits. 
If for any reason, we have failed to list your name in the below, please contact us: * Ana Martinez Pardo * anov @@ -428,7 +428,7 @@ Issues (208): * :ghissue:`533`: py3 compatibility ``pandas.read_csv(urlopen(...))`` * :ghissue:`662`: doc: install instruction: explicit about removing scikits.statsmodels * :ghissue:`910`: test failure Ubuntu TestARMLEConstant.test_dynamic_predict -* :ghissue:`80`: t_model: f_test, t_test don't work +* :ghissue:`80`: t_model: f_test, t_test do not work * :ghissue:`432`: GenericLikelihoodModel change default for score and hessian * :ghissue:`454`: BUG/ENH: HuberScale instance is not used, allow user defined scale estimator * :ghissue:`98`: check connection or connect summary to variable names in wrappers @@ -438,14 +438,14 @@ Issues (208): * :ghissue:`1011`: power ttest endless loop possible * :ghissue:`907`: BLD data_files for stats.libqsturng * :ghissue:`328`: consider moving example scripts into IPython notebooks -* :ghissue:`1002`: Docs won't build with Sphinx 1.1.3 +* :ghissue:`1002`: Docs will not build with Sphinx 1.1.3 * :ghissue:`69`: Make methods like compare_ftest work with wrappers * :ghissue:`503`: summary_old in RegressionResults * :ghissue:`991`: TST precision of normal_power * :ghissue:`945`: Installing statsmodels from github? 
* :ghissue:`964`: Prefer to_offset not get_offset in tsa stuff * :ghissue:`983`: bug: pandas 0.8.1 incompatibility -* :ghissue:`899`: build_ext inplace doesn't cythonize +* :ghissue:`899`: build_ext inplace does not cythonize * :ghissue:`923`: location of initialization code * :ghissue:`980`: auto lag selection in S_hac_simple * :ghissue:`968`: genericMLE Ubuntu test failure @@ -479,7 +479,7 @@ Issues (208): * :ghissue:`630`: problems in regression plots * :ghissue:`885`: Caching behavior for KDEUnivariate icdf * :ghissue:`869`: sm.tsa.ARMA(..., order=(p,q)) gives "__init__() got an unexpected keyword argument 'order'" error -* :ghissue:`783`: statsmodels\distributions\mixture_rvs.py no unit tests +* :ghissue:`783`: statsmodels.distributions.mixture_rvs.py no unit tests * :ghissue:`824`: Multicomparison w/Pandas Series * :ghissue:`789`: presentation of multiple comparison results * :ghissue:`764`: BUG: multipletests incorrect reject for Holm-Sidak @@ -528,7 +528,7 @@ Issues (208): * :ghissue:`716`: Tests missing for functions if pandas is used * :ghissue:`715`: statsmodels regression plots not working with pandas datatypes * :ghissue:`450`: BUG: full_output in optimizers Likelihood model -* :ghissue:`709`: DOCstrings linear models don't have missing params +* :ghissue:`709`: DOCstrings linear models do not have missing params * :ghissue:`370`: BUG weightstats has wrong cov * :ghissue:`694`: DiscreteMargins duplicate method * :ghissue:`702`: bug, pylint stats.anova @@ -548,7 +548,7 @@ Issues (208): * :ghissue:`679`: Test Baxter King band-pass filter fails with scipy 0.12 beta1 * :ghissue:`552`: influence outliers breaks when regressing on constant * :ghissue:`639`: test folders not on python path -* :ghissue:`565`: omni_normtest doesn't propagate the axis argument +* :ghissue:`565`: omni_normtest does not propagate the axis argument * :ghissue:`563`: error in doc generation for AR.fit * :ghissue:`109`: TestProbitCG failure on Ubuntu * :ghissue:`661`: from scipy 
import comb fails on the latest scipy 0.11.0 @@ -574,12 +574,12 @@ Issues (208): * :ghissue:`542`: Regression plots fail when Series objects passed to sm.OLS * :ghissue:`239`: release 0.4.x * :ghissue:`530`: l1 docs issues -* :ghissue:`539`: test for statwriter (failure) +* :ghissue:`539`: test for statawriter (failure) * :ghissue:`490`: Travis CI on PRs * :ghissue:`252`: doc: distributions.rst refers to sandbox only * :ghissue:`85`: release 0.4 * :ghissue:`65`: MLE fit of AR model has no tests -* :ghissue:`522`: ``test`` doesn't propagate arguments to nose +* :ghissue:`522`: ``test`` does not propagate arguments to nose * :ghissue:`517`: missing array conversion or shape in linear model * :ghissue:`523`: test failure with ubuntu decimals too large * :ghissue:`520`: web site documentation, source not updated @@ -616,7 +616,7 @@ Issues (208): * :ghissue:`399`: discrete errors due to incorrect in-place operation * :ghissue:`389`: VAR test_normality is broken with KeyError * :ghissue:`388`: Add tsaplots to graphics.api as graphics.tsa -* :ghissue:`387`: predict date wasn't getting set with start = None +* :ghissue:`387`: predict date was not getting set with start = None * :ghissue:`386`: p-values not returned from acf * :ghissue:`385`: Allow AR.select_order to work without model being fit * :ghissue:`383`: Move mixture_rvs out of sandbox. diff --git a/docs/source/release/version0.6.rst b/docs/source/release/version0.6.rst index 785a5deb66e..300d9a1d861 100644 --- a/docs/source/release/version0.6.rst +++ b/docs/source/release/version0.6.rst @@ -298,10 +298,10 @@ Pull Requests (276): * :ghpull:`1954`: ENH: PHReg formula improvements * :ghpull:`2007`: BLD: Fix build issues * :ghpull:`2006`: BLD: Do not generate cython on clean. Closes #1852. -* :ghpull:`2000`: BLD: Let pip/setuptools handle dependencies that aren't installed at all. +* :ghpull:`2000`: BLD: Let pip/setuptools handle dependencies that are not installed at all. 
* :ghpull:`1999`: Gee offset exposure 1994 rebased * :ghpull:`1998`: BUG/ENH Lasso emptymodel rebased -* :ghpull:`1989`: BUG/ENH: WLS generic robust cov_type didn't use whitened, +* :ghpull:`1989`: BUG/ENH: WLS generic robust cov_type did not use whitened, * :ghpull:`1587`: ENH: Wrap X12/X13-ARIMA AUTOMDL. Closes #442. * :ghpull:`1563`: ENH: Add plot_predict method to ARIMA models. * :ghpull:`1995`: BUG: Fix issue #1993 @@ -331,7 +331,7 @@ Pull Requests (276): * :ghpull:`1938`: ENH: Enable Python 3.4 testing * :ghpull:`1924`: Bug gee cov type 1906 rebased * :ghpull:`1870`: robust covariance, cov_type in fit -* :ghpull:`1859`: BUG: Don't use negative indexing with k_ar == 0. Closes #1858. +* :ghpull:`1859`: BUG: Do not use negative indexing with k_ar == 0. Closes #1858. * :ghpull:`1914`: BUG: LikelihoodModelResults.pvalues use df_resid_inference * :ghpull:`1899`: TST: fix assert_equal for pandas index * :ghpull:`1895`: Bug multicomp pandas @@ -347,7 +347,7 @@ Pull Requests (276): * :ghpull:`1867`: Ref covtype fit * :ghpull:`1865`: Disable tst distribution 1864 * :ghpull:`1856`: _spg_optim returns history of objective function values -* :ghpull:`1854`: BLD: Don't hard-code path for building notebooks. Closes #1249 +* :ghpull:`1854`: BLD: Do not hard-code path for building notebooks. 
Closes #1249 * :ghpull:`1851`: MAINT: Cor nearest factor tests * :ghpull:`1847`: Newton regularize * :ghpull:`1623`: BUG Negbin fit regularized @@ -379,17 +379,17 @@ Pull Requests (276): * :ghpull:`1692`: OSL Example: redundant cell in example removed * :ghpull:`1688`: Kshedden mixed rebased of #1398 * :ghpull:`1629`: Pull request to fix bandwidth bug in issue 597 -* :ghpull:`1666`: Include pyx in sdist but don't install +* :ghpull:`1666`: Include pyx in sdist but do not install * :ghpull:`1683`: TST: GLM shorten random seed closes #1682 * :ghpull:`1681`: Dotplot kshedden rebased of 1294 * :ghpull:`1679`: BUG: Fix problems with predict handling offset and exposure * :ghpull:`1677`: Update docstring of RegressionModel.predict() -* :ghpull:`1635`: Allow offset and exposure to be used together with log link; raise excep... +* :ghpull:`1635`: Allow offset and exposure to be used together with log link; raise except... * :ghpull:`1676`: Tests for SVAR * :ghpull:`1671`: ENH: avoid hard-listed bandwidths -- use present dictionary (+typos fixed) * :ghpull:`1643`: Allow matrix structure in covariance matrices to be exploited * :ghpull:`1657`: BUG: Fix refactor victim. -* :ghpull:`1630`: DOC: typo, "interecept" +* :ghpull:`1630`: DOC: typo, "intercept" * :ghpull:`1619`: MAINT: Dataset docs cleanup and automatic build of docs * :ghpull:`1612`: BUG/ENH Fix negbin exposure #1611 * :ghpull:`1610`: BUG/ENH fix llnull, extra kwds to recreate model @@ -478,7 +478,7 @@ Pull Requests (276): * :ghpull:`1404`: Tst fix genmod link tests * :ghpull:`1396`: REF: Multipletests reduce memory usage * :ghpull:`1380`: DOC :Update vector_ar.rst -* :ghpull:`1381`: BLD: Don't check dependencies on egg_info for pip. Closes #1267. +* :ghpull:`1381`: BLD: Do not check dependencies on egg_info for pip. Closes #1267. * :ghpull:`1302`: BUG: Fix typo. * :ghpull:`1375`: STY: Remove unused imports and comment out unused libraries in setup.py * :ghpull:`1143`: DOC: Update backport notes for new workflow. 
@@ -562,7 +562,7 @@ Issues (252): * :ghissue:`1220`: missing in extra data (example sandwiches, robust covariances) * :ghissue:`1877`: error with GEE on missing data. * :ghissue:`805`: nan with categorical in formula -* :ghissue:`2036`: test in links require exact class so Logit can't work in place of logit +* :ghissue:`2036`: test in links require exact class so Logit cannot work in place of logit * :ghissue:`2010`: Go over deprecations again for 0.6. * :ghissue:`1303`: patsy library not automatically installed * :ghissue:`2024`: genmod Links numerical improvements @@ -577,7 +577,7 @@ Issues (252): * :ghissue:`1875`: dtype bug object arrays (raises in clustered standard errors code) * :ghissue:`1842`: dtype object, glm.fit() gives AttributeError: sqrt * :ghissue:`1300`: Doc errors, missing -* :ghissue:`1164`: RLM cov_params, t_test, f_test don't use bcov_scaled +* :ghissue:`1164`: RLM cov_params, t_test, f_test do not use bcov_scaled * :ghissue:`1019`: 0.6.0 Roadmap * :ghissue:`554`: Prediction Standard Errors * :ghissue:`333`: ENH tools: squeeze in R export file @@ -625,12 +625,12 @@ Issues (252): * :ghissue:`1256`: REF: GEE handling of default covariance matrices * :ghissue:`1760`: Changing covariance_type on results * :ghissue:`1906`: BUG: GEE default covariance is not used -* :ghissue:`1931`: BUG: GEE subclasses NominalGEE don't work with pandas exog -* :ghissue:`1904`: GEE Results doesn't have a Wrapper +* :ghissue:`1931`: BUG: GEE subclasses NominalGEE do not work with pandas exog +* :ghissue:`1904`: GEE Results does not have a Wrapper * :ghissue:`1918`: GEE: required attributes missing, df_resid * :ghissue:`1919`: BUG GEE.predict uses link instead of link.inverse * :ghissue:`1858`: BUG: arimax forecast should special case k_ar == 0 -* :ghissue:`1903`: BUG: pvalues for cluster robust, with use_t don't use df_resid_inference +* :ghissue:`1903`: BUG: pvalues for cluster robust, with use_t do not use df_resid_inference * :ghissue:`1243`: kde silverman bandwidth 
for non-gaussian kernels * :ghissue:`1866`: Pip dependencies * :ghissue:`1850`: TST test_corr_nearest_factor fails on Ubuntu @@ -653,7 +653,7 @@ Issues (252): * :ghissue:`1453`: Discrete NegativeBinomialModel regularized_fit ValueError: matrices are not aligned * :ghissue:`1836`: BUG Got an TypeError trying to import statsmodels.api * :ghissue:`1829`: BUG: GLM summary show "t" use_t=True for summary -* :ghissue:`1828`: BUG summary2 doesn't propagate/use use_t +* :ghissue:`1828`: BUG summary2 does not propagate/use use_t * :ghissue:`1812`: BUG/ REF conf_int and use_t * :ghissue:`1835`: Problems with installation using easy_install * :ghissue:`1801`: BUG 'f_gen' missing in scipy 0.14.0 @@ -670,7 +670,7 @@ Issues (252): * :ghissue:`1323`: Contrast Results after t_test summary broken for 1 parameter * :ghissue:`109`: TestProbitCG failure on Ubuntu * :ghissue:`1690`: TestProbitCG: 8 failing tests (Python 3.4 / Ubuntu 12.04) -* :ghissue:`1763`: Johansen method doesn't give correct index values +* :ghissue:`1763`: Johansen method does not give correct index values * :ghissue:`1761`: doc build failures: ipython version ? ipython directive * :ghissue:`1762`: Unable to build * :ghissue:`1745`: UnicodeDecodeError raised by get_rdataset("Guerry", "HistData") @@ -734,7 +734,7 @@ Issues (252): * :ghissue:`1462`: qqplot line kwarg is broken/docstring is wrong * :ghissue:`1457`: BUG/BLD: Failed build if "sandbox" anywhere in statsmodels path * :ghissue:`1441`: wls function: syntax error "unexpected EOF while parsing" occurs when name of dependent variable starts with digits -* :ghissue:`1428`: ipython_directive doesn't work with ipython master +* :ghissue:`1428`: ipython_directive does not work with ipython master * :ghissue:`1385`: SimpleTable in Summary (e.g. 
OLS) is slow for large models * :ghissue:`1399`: UnboundLocalError: local variable 'fittedvalues' referenced before assignment * :ghissue:`1377`: TestAnova2.test_results fails with pandas 0.13.1 @@ -746,7 +746,7 @@ Issues (252): * :ghissue:`990`: AR fit with bfgs: large score * :ghissue:`14`: arma with exog * :ghissue:`1348`: reset_index + set_index with drop=False -* :ghissue:`1343`: ARMA doesn't pass missing keyword up to TimeSeriesModel +* :ghissue:`1343`: ARMA does not pass missing keyword up to TimeSeriesModel * :ghissue:`1326`: formula example notebook broken * :ghissue:`1327`: typo in docu-code for "Outlier and Influence Diagnostic Measures" * :ghissue:`1309`: Box-Cox transform (some code needed: lambda estimator) @@ -777,11 +777,11 @@ Issues (252): * :ghissue:`1116`: Typo in Example Doc? * :ghissue:`1123`: BUG : arima_model._get_predict_out_of_sample, ignores exogenous of there is no trend ? * :ghissue:`1155`: ARIMA - The computed initial AR coefficients are not stationary -* :ghissue:`979`: Win64 binary can't find Python installation +* :ghissue:`979`: Win64 binary cannot find Python installation * :ghissue:`1046`: TST: test_arima_small_data_bug on current master * :ghissue:`1146`: ARIMA fit failing for small set of data due to invalid maxlag * :ghissue:`1081`: streamline linear algebra for linear model -* :ghissue:`1138`: BUG: pacf_yw doesn't demean +* :ghissue:`1138`: BUG: pacf_yw does not demean * :ghissue:`1127`: Allow linear link model with Binomial families * :ghissue:`1122`: no data cleaning for statsmodels.genmod.families.varfuncs.NegativeBinomial() * :ghissue:`658`: robust.mad is not being computed correctly or is non-standard definition; it returns the median diff --git a/docs/source/release/version0.7.rst b/docs/source/release/version0.7.rst index 22c81979112..e92dd77f782 100644 --- a/docs/source/release/version0.7.rst +++ b/docs/source/release/version0.7.rst @@ -131,7 +131,7 @@ new state space functionality. 
It can be used very similarly to the existing `ARIMA` model, but works on a wider range of specifications, including: * Additive and multiplicative seasonal effects -* Flexible trend specications +* Flexible trend specification * Regression with SARIMA errors * Regression with time-varying coefficients * Measurement error in the endogenous variables diff --git a/docs/source/release/version0.9.rst b/docs/source/release/version0.9.rst index 51bd555d8f8..3559088b7e9 100644 --- a/docs/source/release/version0.9.rst +++ b/docs/source/release/version0.9.rst @@ -245,7 +245,7 @@ Improved time series index support Handling of indexes for time series models has been overhauled (#3272) to take advantage of recent improvements in Pandas and to shift to Pandas much of -the special case handling (espcially for date indexes) that had previously been +the special case handling (especially for date indexes) that had previously been done in Statsmodels. Benefits include more consistent behavior, a reduced number of bugs from corner cases, and a reduction in the maintenance burden. diff --git a/docs/source/sandbox.rst b/docs/source/sandbox.rst index 392f17a5ba9..11b3177b134 100644 --- a/docs/source/sandbox.rst +++ b/docs/source/sandbox.rst @@ -4,7 +4,7 @@ Sandbox ======= -This sandbox contains code that is for various resons not ready to be +This sandbox contains code that is for various reasons not ready to be included in statsmodels proper. 
It contains modules from the old stats.models code that have not been tested, verified and updated to the new statsmodels structure: cox survival model, mixed effects model with repeated measures, diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index f7fabeefd13..3eee553bbfe 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -2115,3 +2115,98 @@ Spearman negsquarenormalg Zyl www +Az +Batchkarov +Cholette +Datatype +Eubank +Filename +HistData +JohansenTestResult +Multi +Outlier +Outliers +Origintests +Refactor +Refactored +STL +Terpenning +Thie +Tuple +Unordered +Workflow +Bulleted +Reparameterizing +distrvs +esciencecenter +asym +avextol +avishaylivne +buitinck +ccovf +conditionnum +tm +tne +toeplitz +tq +tri +tuple +tuples +tvtp +tw +ty +unordered +unweighted +xdist +xopt +xpoints +zaemyung +wargs +violinplots +valentin +tc +td +timestamp +filename +formulae +fortran +gls +lc +ld +ldb +li +logit +meth +mnenomic +multi +nipals +olsresiduals +optimizations +outlier +outliers +eim +epa +homoskedasticity +ianlangmore +ljungbox +lognormalg +movmoment +nitric +nl +oo +refactoring +sep +invweights +linke +linsomniac +mvstdnormcdf +gettingstarted +groupsswithin +invdnormalg +indendented +gright +inverter +prerotated +uniq +nparray +olsresidual \ No newline at end of file diff --git a/docs/source/statespace.rst b/docs/source/statespace.rst index cf4b5a468c5..0acf75400e2 100644 --- a/docs/source/statespace.rst +++ b/docs/source/statespace.rst @@ -377,7 +377,7 @@ other Statsmodels results objects, including standard errors, z-statistics, and prediction / forecasting. More advanced usage is possible, including specifying parameter -transformations, and specifing names for parameters for a more informative +transformations, and specifying names for parameters for a more informative output summary. 
State space representation and Kalman filtering diff --git a/examples/incomplete/glsar.py b/examples/incomplete/glsar.py index 18efabbd18c..3a803dfd4b9 100644 --- a/examples/incomplete/glsar.py +++ b/examples/incomplete/glsar.py @@ -37,7 +37,7 @@ res = model0if.iterative_fit(6) print('iterativefit beta', res.params) results.tvalues # TODO: is this correct? it does equal params/bse - # but isn't the same as the AR example (which was wrong in the first place) + # but is not the same as the AR example (which was wrong) print(results.t_test([0, 1])) # are sd and t correct? vs print(results.f_test(np.eye(2))) diff --git a/examples/incomplete/wls_extended.py b/examples/incomplete/wls_extended.py index 0933632142c..52db43b6ccb 100644 --- a/examples/incomplete/wls_extended.py +++ b/examples/incomplete/wls_extended.py @@ -245,7 +245,7 @@ def index_trim_outlier(resid, k): # a quick bootstrap analysis # -------------------------- # -# (I didn't check whether this is fully correct statistically) +# (I did not check whether this is fully correct statistically) # **With OLS on full sample** @@ -380,12 +380,12 @@ def index_trim_outlier(resid, k): # for statsmodels # # * In this case rsquared for original data looks less random/arbitrary. -# * Don't change definition of rsquared from centered tss to uncentered +# * Do not change definition of rsquared from centered tss to uncentered # tss when calculating rsquared in WLS if the original exog contains # a constant. The increase in rsquared because of a change in definition # will be very misleading. # * Whether there is a constant in the transformed exog, wexog, or not, -# might affect also the degrees of freedom calculation, but I haven't +# might affect also the degrees of freedom calculation, but I have not # checked this. I would guess that the df_model should stay the same, # but needs to be verified with a textbook. 
# * df_model has to be adjusted if the original data does not have a @@ -396,7 +396,7 @@ def index_trim_outlier(resid, k): # This can be done through keyword parameter to model.__init__ or # through autodedection with hasconst = (exog.var(0)<1e-10).any() # I'm not sure about fixed effects with a full dummy set but -# without a constant. In this case autodedection wouldn't work this +# without a constant. In this case autodedection would not work this # way. Also, I'm not sure whether a ddof keyword parameter can also # handle the hasconst case. ''' # noqa:E501 diff --git a/examples/notebooks/chi2_fitting.ipynb b/examples/notebooks/chi2_fitting.ipynb index 1f348e18f37..77ccf88ab64 100644 --- a/examples/notebooks/chi2_fitting.ipynb +++ b/examples/notebooks/chi2_fitting.ipynb @@ -105,7 +105,7 @@ "\n", "Note that `exog` must be a 2-dimensional array with `x` as a column and an extra column of ones. Adding this column of ones means you want to fit the model `y = a * x + b`, leaving it off means you want to fit the model `y = a * x`.\n", "\n", - "And you have to use the option `cov_type='fixed scale'` to tell `statsmodels` that you really have measurement errors with an absolute scale. If you don't, `statsmodels` will treat the weights as relative weights between the data points and internally re-scale them so that the best-fit model will have `chi**2 / ndf = 1`." + "And you have to use the option `cov_type='fixed scale'` to tell `statsmodels` that you really have measurement errors with an absolute scale. If you do not, `statsmodels` will treat the weights as relative weights between the data points and internally re-scale them so that the best-fit model will have `chi**2 / ndf = 1`." ] }, { @@ -171,7 +171,7 @@ "outputs": [], "source": [ "# You can also use `scipy.optimize.minimize` and write your own cost function.\n", - "# This doesn't give you the parameter errors though ... you'd have\n", + "# This does not give you the parameter errors though ... 
you'd have\n", "# to estimate the HESSE matrix separately ...\n", "from scipy.optimize import minimize\n", "\n", diff --git a/examples/notebooks/contrasts.ipynb b/examples/notebooks/contrasts.ipynb index 47fe58b7a78..1f7bff08a4e 100644 --- a/examples/notebooks/contrasts.ipynb +++ b/examples/notebooks/contrasts.ipynb @@ -32,7 +32,7 @@ "source": [ "A categorical variable of K categories, or levels, usually enters a regression as a sequence of K-1 dummy variables. This amounts to a linear hypothesis on the level means. That is, each test statistic for these variables amounts to testing whether the mean for that level is statistically significantly different from the mean of the base category. This dummy coding is called Treatment coding in R parlance, and we will follow this convention. There are, however, different coding methods that amount to different sets of linear hypotheses.\n", "\n", - "In fact, the dummy coding is not technically a contrast coding. This is because the dummy variables add to one and are not functionally independent of the model's intercept. On the other hand, a set of *contrasts* for a categorical variable with `k` levels is a set of `k-1` functionally independent linear combinations of the factor level means that are also independent of the sum of the dummy variables. The dummy coding isn't wrong *per se*. It captures all of the coefficients, but it complicates matters when the model assumes independence of the coefficients such as in ANOVA. Linear regression models do not assume independence of the coefficients and thus dummy coding is often the only coding that is taught in this context.\n", + "In fact, the dummy coding is not technically a contrast coding. This is because the dummy variables add to one and are not functionally independent of the model's intercept. 
On the other hand, a set of *contrasts* for a categorical variable with `k` levels is a set of `k-1` functionally independent linear combinations of the factor level means that are also independent of the sum of the dummy variables. The dummy coding is not wrong *per se*. It captures all of the coefficients, but it complicates matters when the model assumes independence of the coefficients such as in ANOVA. Linear regression models do not assume independence of the coefficients and thus dummy coding is often the only coding that is taught in this context.\n", "\n", "To have a look at the contrast matrices in Patsy, we will use data from UCLA ATS. First let's load the data." ] @@ -158,7 +158,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. If it does not, this conversion happens under the hood, so this won't work in general but nonetheless is a useful exercise to fix ideas. The below illustrates the output using the three contrasts above" + "This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. If it does not, this conversion happens under the hood, so this will not work in general but nonetheless is a useful exercise to fix ideas. The below illustrates the output using the three contrasts above" ] }, { @@ -193,7 +193,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors. Patsy doesn't have the Simple contrast included, but you can easily define your own contrasts. To do so, write a class that contains a code_with_intercept and a code_without_intercept method that returns a patsy.contrast.ContrastMatrix instance" + "Like Treatment Coding, Simple Coding compares each level to a fixed reference level. 
However, with simple coding, the intercept is the grand mean of all the levels of the factors. Patsy does not have the Simple contrast included, but you can easily define your own contrasts. To do so, write a class that contains a code_with_intercept and a code_without_intercept method that returns a patsy.contrast.ContrastMatrix instance" ] }, { diff --git a/examples/notebooks/generic_mle.ipynb b/examples/notebooks/generic_mle.ipynb index b78e7d503ee..178fbe61c35 100644 --- a/examples/notebooks/generic_mle.ipynb +++ b/examples/notebooks/generic_mle.ipynb @@ -168,7 +168,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice that the ``GenericMaximumLikelihood`` class provides automatic differentiation, so we didn't have to provide Hessian or Score functions in order to calculate the covariance estimates." + "Notice that the ``GenericMaximumLikelihood`` class provides automatic differentiation, so we did not have to provide Hessian or Score functions in order to calculate the covariance estimates." ] }, { diff --git a/examples/notebooks/glm_weights.ipynb b/examples/notebooks/glm_weights.ipynb index e889f07674c..1287ee2c68f 100644 --- a/examples/notebooks/glm_weights.ipynb +++ b/examples/notebooks/glm_weights.ipynb @@ -298,7 +298,7 @@ "### condensed using ``var_weights`` instead of ``freq_weights``\n", "\n", "Next, we compare ``var_weights`` to ``freq_weights``. It is a common practice to incorporate ``var_weights`` when the endogenous variable reflects averages and not identical observations.\n", - "I don't see a theoretical reason why it produces the same results (in general).\n", + "I do not see a theoretical reason why it produces the same results (in general).\n", "\n", "This produces the same results but ``df_resid`` differs the ``freq_weights`` example because ``var_weights`` do not change the number of effective observations. 
\n" ] diff --git a/examples/notebooks/gls.ipynb b/examples/notebooks/gls.ipynb index 0350a126bbf..cfbf79891c0 100644 --- a/examples/notebooks/gls.ipynb +++ b/examples/notebooks/gls.ipynb @@ -91,7 +91,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " While we don't have strong evidence that the errors follow an AR(1)\n", + " While we do not have strong evidence that the errors follow an AR(1)\n", " process we continue" ] }, diff --git a/examples/notebooks/interactions_anova.ipynb b/examples/notebooks/interactions_anova.ipynb index 83d697edc37..4f78bc605e1 100644 --- a/examples/notebooks/interactions_anova.ipynb +++ b/examples/notebooks/interactions_anova.ipynb @@ -441,7 +441,7 @@ "source": [ "try:\n", " jobtest_table = pd.read_table('jobtest.table')\n", - "except: # don't have data already\n", + "except: # do not have data already\n", " url = 'http://stats191.stanford.edu/data/jobtest.table'\n", " jobtest_table = pd.read_table(url)\n", "\n", @@ -752,7 +752,7 @@ "\n", " Types I and II are equivalent under a balanced design.\n", "\n", - " Don't use Type III with non-orthogonal contrast - ie., Treatment" + " Do not use Type III with non-orthogonal contrast - ie., Treatment" ] }, { diff --git a/examples/notebooks/markov_regression.ipynb b/examples/notebooks/markov_regression.ipynb index 13577834540..217f60f08ec 100644 --- a/examples/notebooks/markov_regression.ipynb +++ b/examples/notebooks/markov_regression.ipynb @@ -322,7 +322,7 @@ "source": [ "### Switching variances\n", "\n", - "We can also accomodate switching variances. In particular, we consider the model\n", + "We can also accommodate switching variances. 
In particular, we consider the model\n", "\n", "$$\n", "y_t = \\mu_{S_t} + y_{t-1} \\beta_{S_t} + \\varepsilon_t \\quad \\varepsilon_t \\sim N(0, \\sigma_{S_t}^2)\n", diff --git a/examples/notebooks/predict.ipynb b/examples/notebooks/predict.ipynb index 5e38b8ff834..73678408989 100644 --- a/examples/notebooks/predict.ipynb +++ b/examples/notebooks/predict.ipynb @@ -159,7 +159,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We use the `I` to indicate use of the Identity transform. Ie., we don't want any expansion magic from using `**2`" + "We use the `I` to indicate use of the Identity transform. Ie., we do not want any expansion magic from using `**2`" ] }, { diff --git a/examples/notebooks/regression_plots.ipynb b/examples/notebooks/regression_plots.ipynb index f8fba74eac7..18c42357ce3 100644 --- a/examples/notebooks/regression_plots.ipynb +++ b/examples/notebooks/regression_plots.ipynb @@ -533,7 +533,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There isn't yet an influence diagnostics method as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))" + "There is not yet an influence diagnostics method as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))" ] }, { diff --git a/examples/notebooks/robust_models_1.ipynb b/examples/notebooks/robust_models_1.ipynb index a50dae0349f..a9fff69b7a3 100644 --- a/examples/notebooks/robust_models_1.ipynb +++ b/examples/notebooks/robust_models_1.ipynb @@ -908,7 +908,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "* MM estimators are good for this type of problem, unfortunately, we don't yet have these yet. \n", + "* MM estimators are good for this type of problem, unfortunately, we do not yet have these yet. \n", "* It's being worked on, but it gives a good excuse to look at the R cell magics in the notebook." 
] }, diff --git a/examples/notebooks/statespace_dfm_coincident.ipynb b/examples/notebooks/statespace_dfm_coincident.ipynb index cda6c67c6f8..0363d827750 100644 --- a/examples/notebooks/statespace_dfm_coincident.ipynb +++ b/examples/notebooks/statespace_dfm_coincident.ipynb @@ -767,9 +767,9 @@ "\n", "#### `transform_params` and `untransform_params`\n", "\n", - "The optimizer selects possibly parameter values in an unconstrained way. That's not usually desired (since variances can't be negative, for example), and `transform_params` is used to transform the unconstrained values used by the optimizer to constrained values appropriate to the model. Variances terms are typically squared (to force them to be positive), and AR lag coefficients are often constrained to lead to a stationary model. `untransform_params` is used for the reverse operation (and is important because starting parameters are usually specified in terms of values appropriate to the model, and we need to convert them to parameters appropriate to the optimizer before we can begin the optimization routine).\n", + "The optimizer selects possibly parameter values in an unconstrained way. That's not usually desired (since variances cannot be negative, for example), and `transform_params` is used to transform the unconstrained values used by the optimizer to constrained values appropriate to the model. Variances terms are typically squared (to force them to be positive), and AR lag coefficients are often constrained to lead to a stationary model. 
`untransform_params` is used for the reverse operation (and is important because starting parameters are usually specified in terms of values appropriate to the model, and we need to convert them to parameters appropriate to the optimizer before we can begin the optimization routine).\n", "\n", - "Even though we don't need to transform or untransform our new parameters (the loadings can in theory take on any values), we still need to modify this function for two reasons:\n", + "Even though we do not need to transform or untransform our new parameters (the loadings can in theory take on any values), we still need to modify this function for two reasons:\n", "\n", "1. The version in the `DynamicFactor` class is expecting 3 fewer parameters than we have now. At a minimum, we need to handle the three new parameters.\n", "2. The version in the `DynamicFactor` class constrains the factor lag coefficients to be stationary as though it was an AR(4) model. Since we actually have an AR(2) model, we need to re-do the constraint. We also set the last two autoregressive coefficients to be zero here.\n", diff --git a/examples/notebooks/statespace_forecasting.ipynb b/examples/notebooks/statespace_forecasting.ipynb index b5c47702bc7..d0c6008fe0e 100644 --- a/examples/notebooks/statespace_forecasting.ipynb +++ b/examples/notebooks/statespace_forecasting.ipynb @@ -161,7 +161,7 @@ "outputs": [], "source": [ "fcast_res2 = res.get_forecast(steps=2)\n", - "# Note: since we didn't specify the alpha parameter, the\n", + "# Note: since we did not specify the alpha parameter, the\n", "# confidence level is at the default, 95%\n", "print(fcast_res2.summary_frame())" ] @@ -665,7 +665,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "What this means is that you can't specify forecasting steps by dates, and the output of the `forecast` and `get_forecast` methods won't have associated dates. 
The reason is that without a given frequency, there is no way to determine what date each forecast should be assigned to. In the example above, there is no pattern to the date/time stamps of the index, so there is no way to determine what the next date/time should be (should it be in the morning of 2000-01-02? the afternoon? or maybe not until 2000-01-03?).\n", + "What this means is that you cannot specify forecasting steps by dates, and the output of the `forecast` and `get_forecast` methods will not have associated dates. The reason is that without a given frequency, there is no way to determine what date each forecast should be assigned to. In the example above, there is no pattern to the date/time stamps of the index, so there is no way to determine what the next date/time should be (should it be in the morning of 2000-01-02? the afternoon? or maybe not until 2000-01-03?).\n", "\n", "For example, if we forecast one-step-ahead:" ] diff --git a/examples/notebooks/statespace_sarimax_stata.ipynb b/examples/notebooks/statespace_sarimax_stata.ipynb index 3c49eb3cb81..a526c6050be 100644 --- a/examples/notebooks/statespace_sarimax_stata.ipynb +++ b/examples/notebooks/statespace_sarimax_stata.ipynb @@ -235,7 +235,7 @@ "(1 - \\phi_1 L)\\Delta y_t = c + (1 + \\theta_1 L + \\theta_2 L^2 + \\theta_3 L^3 + \\theta_4 L^4) \\epsilon_{t}\n", "$$\n", "\n", - "When the specification parameter is given as a maximum degree of the lag polynomial, it implies that all polynomial terms up to that degree are included. Notice that this is *not* the model we want to use, because it would include terms for $\\epsilon_{t-2}$ and $\\epsilon_{t-3}$, which we don't want here.\n", + "When the specification parameter is given as a maximum degree of the lag polynomial, it implies that all polynomial terms up to that degree are included. 
Notice that this is *not* the model we want to use, because it would include terms for $\\epsilon_{t-2}$ and $\\epsilon_{t-3}$, which we do not want here.\n", "\n", "What we want is a polynomial that has terms for the 1st and 4th degrees, but leaves out the 2nd and 3rd terms. To do that, we need to provide a tuple for the specifiation parameter, where the tuple describes **the lag polynomial itself**. In particular, here we would want to use:\n", "\n", diff --git a/examples/notebooks/statespace_seasonal.ipynb b/examples/notebooks/statespace_seasonal.ipynb index 437b771727b..1b42d289319 100644 --- a/examples/notebooks/statespace_seasonal.ipynb +++ b/examples/notebooks/statespace_seasonal.ipynb @@ -198,7 +198,7 @@ "source": [ "### Unobserved components (mixed time and frequency domain modeling)\n", "\n", - "The second method is an unobserved components model, where the trend is modeled as a fixed intercept and the seasonal components are modeled using 10 constants summing to 0 and trigonometric functions with a primary periodicities of 100 with 2 harmonics total. Note that this isn't the generating model, as it presupposes that there are more state errors for the shorter seasonal component than in reality. The process for the time series can be written as:\n", + "The second method is an unobserved components model, where the trend is modeled as a fixed intercept and the seasonal components are modeled using 10 constants summing to 0 and trigonometric functions with a primary periodicities of 100 with 2 harmonics total. Note that this is not the generating model, as it presupposes that there are more state errors for the shorter seasonal component than in reality. 
The process for the time series can be written as:\n", "\n", "$$\n", "\\begin{align}\n", @@ -246,7 +246,7 @@ "source": [ "### Unobserved components (lazy frequency domain modeling)\n", "\n", - "The third method is an unobserved components model with a fixed intercept and one seasonal component, which is modeled using trigonometric functions with primary periodicity 100 and 50 harmonics. Note that this isn't the generating model, as it presupposes that there are more harmonics then in reality. Because the variances are tied together, we are not able to drive the estimated covariance of the non-existent harmonics to 0. What is lazy about this model specification is that we have not bothered to specify the two different seasonal components and instead chosen to model them using a single component with enough harmonics to cover both. We will not be able to capture any differences in variances between the two true components. The process for the time series can be written as:\n", + "The third method is an unobserved components model with a fixed intercept and one seasonal component, which is modeled using trigonometric functions with primary periodicity 100 and 50 harmonics. Note that this is not the generating model, as it presupposes that there are more harmonics then in reality. Because the variances are tied together, we are not able to drive the estimated covariance of the non-existent harmonics to 0. What is lazy about this model specification is that we have not bothered to specify the two different seasonal components and instead chosen to model them using a single component with enough harmonics to cover both. We will not be able to capture any differences in variances between the two true components. 
The process for the time series can be written as:\n", "\n", "$$\n", "\\begin{align}\n", diff --git a/examples/python/chi2_fitting.py b/examples/python/chi2_fitting.py index 36176efde46..dead0f5fb2d 100644 --- a/examples/python/chi2_fitting.py +++ b/examples/python/chi2_fitting.py @@ -87,7 +87,7 @@ # # And you have to use the option `cov_type='fixed scale'` to tell # `statsmodels` that you really have measurement errors with an absolute -# scale. If you don't, `statsmodels` will treat the weights as relative +# scale. If you do not, `statsmodels` will treat the weights as relative # weights between the data points and internally re-scale them so that the # best-fit model will have `chi**2 / ndf = 1`. @@ -122,7 +122,7 @@ def f(x, a, b): # You can also use `scipy.optimize.minimize` and write your own cost # function. -# This doesn't give you the parameter errors though ... you'd have +# This does not give you the parameter errors though ... you would have # to estimate the HESSE matrix separately ... from scipy.optimize import minimize diff --git a/examples/python/contrasts.py b/examples/python/contrasts.py index 3a3629f1c4f..d24015ec2ca 100644 --- a/examples/python/contrasts.py +++ b/examples/python/contrasts.py @@ -30,7 +30,7 @@ # *contrasts* for a categorical variable with `k` levels is a set of `k-1` # functionally independent linear combinations of the factor level means # that are also independent of the sum of the dummy variables. The dummy -# coding isn't wrong *per se*. It captures all of the coefficients, but it +# coding is not wrong *per se*. It captures all of the coefficients, but it # complicates matters when the model assumes independence of the # coefficients such as in ANOVA. Linear regression models do not assume # independence of the coefficients and thus dummy coding is often the only @@ -80,7 +80,7 @@ # This is a bit of a trick, as the `race` category conveniently maps to # zero-based indices. 
If it does not, this conversion happens under the -# hood, so this won't work in general but nonetheless is a useful exercise +# hood, so this will not work in general but nonetheless is a useful exercise # to fix ideas. The below illustrates the output using the three contrasts # above @@ -96,7 +96,7 @@ # Like Treatment Coding, Simple Coding compares each level to a fixed # reference level. However, with simple coding, the intercept is the grand -# mean of all the levels of the factors. Patsy doesn't have the Simple +# mean of all the levels of the factors. Patsy does not have the Simple # contrast included, but you can easily define your own contrasts. To do so, # write a class that contains a code_with_intercept and a # code_without_intercept method that returns a patsy.contrast.ContrastMatrix diff --git a/examples/python/generic_mle.py b/examples/python/generic_mle.py index 8f2370e54c6..e00d06a4427 100644 --- a/examples/python/generic_mle.py +++ b/examples/python/generic_mle.py @@ -70,7 +70,7 @@ def loglike(self, params): print(sm_probit_manual.cov_params()) # Notice that the ``GenericMaximumLikelihood`` class provides automatic -# differentiation, so we didn't have to provide Hessian or Score functions +# differentiation, so we did not have to provide Hessian or Score functions # in order to calculate the covariance estimates. # diff --git a/examples/python/glm_weights.py b/examples/python/glm_weights.py index a4c5be364a8..de7185abe9e 100644 --- a/examples/python/glm_weights.py +++ b/examples/python/glm_weights.py @@ -146,7 +146,7 @@ def merge_tuple(tpl): # Next, we compare ``var_weights`` to ``freq_weights``. It is a common # practice to incorporate ``var_weights`` when the endogenous variable # reflects averages and not identical observations. -# I don't see a theoretical reason why it produces the same results (in +# I do not see a theoretical reason why it produces the same results (in # general). 
# # This produces the same results but ``df_resid`` differs the diff --git a/examples/python/gls.py b/examples/python/gls.py index 2f7dc75f5c4..975faf2e97b 100644 --- a/examples/python/gls.py +++ b/examples/python/gls.py @@ -39,7 +39,7 @@ print(resid_fit.tvalues[1]) print(resid_fit.pvalues[1]) -# While we don't have strong evidence that the errors follow an AR(1) +# While we do not have strong evidence that the errors follow an AR(1) # process we continue rho = resid_fit.params[1] diff --git a/examples/python/interactions_anova.py b/examples/python/interactions_anova.py index 01a2dcf743f..cd820312dc4 100644 --- a/examples/python/interactions_anova.py +++ b/examples/python/interactions_anova.py @@ -238,7 +238,7 @@ try: jobtest_table = pd.read_table('jobtest.table') -except: # don't have data already +except: # do not have data already url = 'http://stats191.stanford.edu/data/jobtest.table' jobtest_table = pd.read_table(url) @@ -432,7 +432,7 @@ # # Types I and II are equivalent under a balanced design. # -# Don't use Type III with non-orthogonal contrast - ie., Treatment +# Do not use Type III with non-orthogonal contrast - ie., Treatment sum_lm = ols( 'np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)', data=kt).fit() diff --git a/examples/python/markov_regression.py b/examples/python/markov_regression.py index e2f3408fc90..83a254228b9 100644 --- a/examples/python/markov_regression.py +++ b/examples/python/markov_regression.py @@ -185,7 +185,7 @@ # ### Switching variances # -# We can also accomodate switching variances. In particular, we consider +# We can also accommodate switching variances. In particular, we consider # the model # # $$ diff --git a/examples/python/predict.py b/examples/python/predict.py index eb9dc3b2512..aacd906fa59 100644 --- a/examples/python/predict.py +++ b/examples/python/predict.py @@ -66,7 +66,7 @@ res = ols("y ~ x1 + np.sin(x1) + I((x1-5)**2)", data=data).fit() -# We use the `I` to indicate use of the Identity transform. 
Ie., we don't +# We use the `I` to indicate use of the Identity transform. Ie., we do not # want any expansion magic from using `**2` res.params diff --git a/examples/python/regression_plots.py b/examples/python/regression_plots.py index 1729b758109..e24b785bb2d 100644 --- a/examples/python/regression_plots.py +++ b/examples/python/regression_plots.py @@ -237,7 +237,7 @@ # data=dta, M=sm.robust.norms.TukeyBiweight()).fit(conv="weights") #print(rob_crime_model.summary()) -# There isn't yet an influence diagnostics method as part of RLM, but we +# There is not yet an influence diagnostics method as part of RLM, but we # can recreate them. (This depends on the status of [issue # #888](https://github.com/statsmodels/statsmodels/issues/808)) diff --git a/examples/python/robust_models_1.py b/examples/python/robust_models_1.py index 8eb566bc23a..9a11e6836fd 100644 --- a/examples/python/robust_models_1.py +++ b/examples/python/robust_models_1.py @@ -291,7 +291,7 @@ def plot_weights(support, weights_func, xlabels, xticks): abline_plot(model_results=wls_model, ax=ax, color='green') # * MM estimators are good for this type of problem, unfortunately, we -# don't yet have these yet. +# do not have these yet. # * It's being worked on, but it gives a good excuse to look at the R cell # magics in the notebook. diff --git a/examples/python/statespace_dfm_coincident.py b/examples/python/statespace_dfm_coincident.py index c494e81f243..09c7708fc2f 100644 --- a/examples/python/statespace_dfm_coincident.py +++ b/examples/python/statespace_dfm_coincident.py @@ -744,7 +744,7 @@ def update(self, params, transformed=True, complex_step=False): # #### `transform_params` and `untransform_params` # # The optimizer selects possibly parameter values in an unconstrained way. 
-# That's not usually desired (since variances can't be negative, for +# That is not usually desired (since variances cannot be negative, for # example), and `transform_params` is used to transform the unconstrained # values used by the optimizer to constrained values appropriate to the # model. Variances terms are typically squared (to force them to be @@ -755,7 +755,7 @@ def update(self, params, transformed=True, complex_step=False): # parameters appropriate to the optimizer before we can begin the # optimization routine). # -# Even though we don't need to transform or untransform our new parameters +# Even though we do not need to transform or untransform our new parameters # (the loadings can in theory take on any values), we still need to modify # this function for two reasons: # diff --git a/examples/python/statespace_sarimax_stata.py b/examples/python/statespace_sarimax_stata.py index f901d8a01d7..76f14c9ae66 100644 --- a/examples/python/statespace_sarimax_stata.py +++ b/examples/python/statespace_sarimax_stata.py @@ -215,7 +215,7 @@ # polynomial, it implies that all polynomial terms up to that degree are # included. Notice that this is *not* the model we want to use, because it # would include terms for $\epsilon_{t-2}$ and $\epsilon_{t-3}$, which we -# don't want here. +# do not want here. # # What we want is a polynomial that has terms for the 1st and 4th degrees, # but leaves out the 2nd and 3rd terms. 
To do that, we need to provide a diff --git a/examples/python/statespace_seasonal.py b/examples/python/statespace_seasonal.py index 5a59c7da0ae..26eb6835bb7 100644 --- a/examples/python/statespace_seasonal.py +++ b/examples/python/statespace_seasonal.py @@ -175,7 +175,7 @@ def simulate_seasonal_term(periodicity, # The second method is an unobserved components model, where the trend is # modeled as a fixed intercept and the seasonal components are modeled using # 10 constants summing to 0 and trigonometric functions with a primary -# periodicities of 100 with 2 harmonics total. Note that this isn't the +# periodicity of 100 with 2 harmonics total. Note that this is not the # generating model, as it presupposes that there are more state errors for # the shorter seasonal component than in reality. The process for the time # series can be written as: @@ -224,7 +224,7 @@ def simulate_seasonal_term(periodicity, # The third method is an unobserved components model with a fixed # intercept and one seasonal component, which is modeled using trigonometric # functions with primary periodicity 100 and 50 harmonics. Note that this -# isn't the generating model, as it presupposes that there are more +# is not the generating model, as it presupposes that there are more # harmonics then in reality. Because the variances are tied together, we # are not able to drive the estimated covariance of the non-existent # harmonics to 0. 
What is lazy about this model specification is that we diff --git a/setup.cfg b/setup.cfg index 1314dba5e4f..badaed4d4e4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -213,12 +213,12 @@ exclude_lines = # Have to re-enable the standard pragma pragma: no cover - # Don't complain about missing debug-only code: + # Do not complain about missing debug-only code: def __repr__ if self\.debug if pdf_output: - # Don't complain if tests don't hit defensive assertion code: + # Do not complain if tests do not hit defensive assertion code: raise AssertionError raise NotImplementedError except NotImplementedError @@ -229,7 +229,7 @@ exclude_lines = # Ignore pass pass - # Don't complain if non-runnable code isn't run: + # Do not complain if non-runnable code is not run: if 0: if __name__ == .__main__.: diff --git a/statsmodels/_version.py b/statsmodels/_version.py index da393298c28..0f646a2c723 100644 --- a/statsmodels/_version.py +++ b/statsmodels/_version.py @@ -126,14 +126,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + raise NotThisMethod("rootdir does not start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, + # keywords. When used from setup.py, we do not want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} @@ -218,7 +218,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. 
This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short + expanded, and _version.py has not already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] @@ -233,7 +233,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) + # if there is not one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], @@ -277,9 +277,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" + fmt = "tag '%s' does not start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + pieces["error"] = ("tag '%s' does not start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] @@ -306,7 +306,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" + """Return a + if we do not already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" @@ -358,7 +358,7 @@ def render_pep440_post(pieces): The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. + but you should not be releasing software with -dirty anyway. Exceptions: 1: no tags. 
0.postDISTANCE[.dev0] @@ -478,7 +478,7 @@ def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # py2exe/bbfreeze/non-CPython implementations do not do __file__, in which # case we can only use expanded keywords. cfg = get_config() diff --git a/statsmodels/base/_constraints.py b/statsmodels/base/_constraints.py index c19014bfecf..8d1afea70c6 100644 --- a/statsmodels/base/_constraints.py +++ b/statsmodels/base/_constraints.py @@ -39,7 +39,7 @@ class TransformRestriction(object): b1 + b2 = 0 and b1 + 2*b2 = 0, implies that b2 = 0. The transformation applied from full to reduced parameter space does not - raise and exception if the constraint doesn't hold. + raise an exception if the constraint does not hold. TODO: maybe change this, what's the behavior in this case? 
@@ -246,7 +246,7 @@ def fit_constrained(model, constraint_matrix, constraint_values, if start_params is not None: start_params = transf.reduce(start_params) - #need copy, because we don't want to change it, we don't need deepcopy + #need copy, because we do not want to change it, we do not need deepcopy import copy init_kwds = copy.copy(self._get_init_kwds()) diff --git a/statsmodels/base/_parameter_inference.py b/statsmodels/base/_parameter_inference.py index 4699cc62eb9..c5591fb9446 100644 --- a/statsmodels/base/_parameter_inference.py +++ b/statsmodels/base/_parameter_inference.py @@ -229,7 +229,7 @@ def score_test(self, exog_extra=None, params_constrained=None, # cov_score_test_inv = cov_lm_robust(score, r_matrix, hinv, # cov_score, cov_params=None) elif cov_type.upper() == 'V': - # TODO: this doesn't work, V in fit_constrained results is singular + # TODO: this does not work, V in fit_constrained results is singular # we need cov_params without the zeros in it hinv = -np.linalg.inv(hessian) cov_score = nobs * np.cov(score_obs.T) diff --git a/statsmodels/base/_screening.py b/statsmodels/base/_screening.py index 18771968dad..9b7e8727290 100644 --- a/statsmodels/base/_screening.py +++ b/statsmodels/base/_screening.py @@ -267,7 +267,7 @@ def screen_exog(self, exog, endog=None, maxiter=100, method='bfgs', keep = np.ones(k_keep, np.bool_) idx_excl = np.arange(k_keep, k_vars) mod_pen = model_class(endog, x0, **self.init_kwds) - # don't penalize initial estimate + # do not penalize initial estimate mod_pen.pen_weight = 0 res_pen = mod_pen.fit(**fit_kwds) start_params = res_pen.params diff --git a/statsmodels/base/covtype.py b/statsmodels/base/covtype.py index 18ff0c2dfae..6b0b5e85dfd 100644 --- a/statsmodels/base/covtype.py +++ b/statsmodels/base/covtype.py @@ -201,7 +201,7 @@ def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): if use_self: res = self else: - # this doesn't work for most models, use raw instance instead from fit + # this does not work 
for most models, use raw instance instead from fit res = self.__class__(self.model, self.params, normalized_cov_params=self.normalized_cov_params, scale=self.scale) @@ -218,7 +218,7 @@ def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): df_correction = kwds.get('df_correction', None) # TODO: check also use_correction, do I need all combinations? if df_correction is not False: # i.e. in [None, True]: - # user didn't explicitely set it to False + # user did not explicitly set it to False adjust_df = True res.cov_kwds['adjust_df'] = adjust_df @@ -235,7 +235,7 @@ def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None) if res.cov_params_default is None: - # results classes that don't have cov_HCx attribute + # results classes that do not have cov_HCx attribute res.cov_params_default = sw.cov_white_simple(self, use_correction=False) elif cov_type.lower() == 'hac':
""" - #Have to have the asarrays because isnull doesn't account for array_like + #Have to have the asarrays because isnull does not account for array_like #input x = np.asarray(x) if x.ndim == 1: @@ -148,7 +148,7 @@ def _handle_constant(self, hasconst): break values.append(value) else: - # we didn't break, no column of ones + # we did not break, no column of ones pos = (np.array(values) != 0) if pos.any(): # take the first nonzero column @@ -160,7 +160,7 @@ def _handle_constant(self, hasconst): elif self.k_constant == 0: check_implicit = True else: - # shouldn't be here + # should not be here pass if check_implicit and not hasconst: @@ -174,7 +174,7 @@ def _handle_constant(self, hasconst): self.const_idx = None elif hasconst: # Ensure k_constant is 1 any time hasconst is True - # even if one isn't found + # even if one is not found self.k_constant = 1 @classmethod @@ -232,7 +232,7 @@ def handle_missing(cls, endog, exog, missing, **kwargs): combined_2d_names += [key] else: raise ValueError("Arrays with more than 2 dimensions " - "aren't yet handled") + "are not yet handled") if missing_idx is not None: nan_mask = missing_idx @@ -261,7 +261,7 @@ def handle_missing(cls, endog, exog, missing, **kwargs): if combined_2d: nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d) - if not np.any(nan_mask): # no missing don't do anything + if not np.any(nan_mask): # no missing do not do anything combined = dict(zip(combined_names, combined)) if combined_2d: combined.update(dict(zip(combined_2d_names, combined_2d))) @@ -548,7 +548,7 @@ def attach_generic_columns_2d(self, result, rownames, colnames=None): def attach_columns(self, result): # this can either be a 1d array or a scalar - # don't squeeze because it might be a 2d row array + # do not squeeze because it might be a 2d row array # if it needs a squeeze, the bug is elsewhere if result.ndim <= 1: return Series(result, index=self.param_names) diff --git a/statsmodels/base/distributed_estimation.py 
b/statsmodels/base/distributed_estimation.py index 04cb646d025..f12fb532c5a 100644 --- a/statsmodels/base/distributed_estimation.py +++ b/statsmodels/base/distributed_estimation.py @@ -329,7 +329,7 @@ def _helper_fit_partition(self, pnum, endog, exog, fit_kwds, init_kwds_e={}): """handles the model fitting for each machine. NOTE: this is primarily handled outside of DistributedModel because - joblib can't handle class methods. + joblib cannot handle class methods. Parameters ---------- diff --git a/statsmodels/base/elastic_net.py b/statsmodels/base/elastic_net.py index b9616578d59..84313d97501 100644 --- a/statsmodels/base/elastic_net.py +++ b/statsmodels/base/elastic_net.py @@ -151,7 +151,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, params_zero = np.zeros(len(params), dtype=bool) init_args = model._get_init_kwds() - # we don't need a copy of init_args because get_init_kwds provides new dict + # we do not need a copy of init_args b/c get_init_kwds provides new dict init_args['hasconst'] = False model_offset = init_args.pop('offset', None) if 'exposure' in init_args and init_args['exposure'] is not None: @@ -171,7 +171,7 @@ def fit_elasticnet(model, method="coord_descent", maxiter=100, for k in range(k_exog): # Under the active set method, if a parameter becomes - # zero we don't try to change it again. + # zero we do not try to change it again. # TODO : give the user the option to switch this off if params_zero[k]: continue diff --git a/statsmodels/base/l1_cvxopt.py b/statsmodels/base/l1_cvxopt.py index 9c262a03e22..93148c51abe 100644 --- a/statsmodels/base/l1_cvxopt.py +++ b/statsmodels/base/l1_cvxopt.py @@ -40,7 +40,7 @@ def fit_l1_cvxopt_cp( auto_trim_tol : float For sue when trim_mode == 'auto'. Use qc_tol : float - Print warning and don't allow auto trim when (ii) in "Theory" (above) + Print warning and do not allow auto trim when (ii) in "Theory" (above) is violated by this much. 
qc_verbose : Boolean If true, print out a full QC report upon failure @@ -116,7 +116,7 @@ def F(x=None, z=None): auto_trim_tol) ### Pack up return values for statsmodels - # TODO These retvals are returned as mle_retvals...but the fit wasn't ML + # TODO These retvals are returned as mle_retvals...but the fit was not ML if full_output: fopt = f_0(x) gopt = float('nan') # Objective is non-differentiable diff --git a/statsmodels/base/l1_slsqp.py b/statsmodels/base/l1_slsqp.py index 06aff594799..1a20be76aee 100644 --- a/statsmodels/base/l1_slsqp.py +++ b/statsmodels/base/l1_slsqp.py @@ -41,7 +41,7 @@ def fit_l1_slsqp( auto_trim_tol : float For sue when trim_mode == 'auto'. Use qc_tol : float - Print warning and don't allow auto trim when (ii) in "Theory" (above) + Print warning and do not allow auto trim when (ii) in "Theory" (above) is violated by this much. qc_verbose : Boolean If true, print out a full QC report upon failure @@ -94,7 +94,7 @@ def fit_l1_slsqp( auto_trim_tol) ### Pack up return values for statsmodels optimizers - # TODO These retvals are returned as mle_retvals...but the fit wasn't ML. + # TODO These retvals are returned as mle_retvals...but the fit was not ML. # This could be confusing someday. if full_output: x_full, fx, its, imode, smode = results diff --git a/statsmodels/base/l1_solvers_common.py b/statsmodels/base/l1_solvers_common.py index 747e8c7acff..e0bb7fa23fc 100644 --- a/statsmodels/base/l1_solvers_common.py +++ b/statsmodels/base/l1_solvers_common.py @@ -96,7 +96,7 @@ def do_trim_params(params, k_params, alpha, score, passed, trim_mode, Trims (set to zero) params that are zero at the theoretical minimum. Uses heuristics to account for the solver not actually finding the minimum. - In all cases, if alpha[i] == 0, then don't trim the ith param. + In all cases, if alpha[i] == 0, then do not trim the ith param. In all cases, do nothing with the added variables. 
Parameters @@ -122,7 +122,7 @@ def do_trim_params(params, k_params, alpha, score, passed, trim_mode, auto_trim_tol : float For sue when trim_mode == 'auto'. Use qc_tol : float - Print warning and don't allow auto trim when (ii) in "Theory" (above) + Print warning and do not allow auto trim when (ii) in "Theory" (above) is violated by this much. Returns diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py index 979c357b7db..13c28f604a6 100644 --- a/statsmodels/base/model.py +++ b/statsmodels/base/model.py @@ -74,10 +74,10 @@ def __init__(self, endog, exog=None, **kwargs): self.endog = self.data.endog self._data_attr = [] self._data_attr.extend(['exog', 'endog', 'data.exog', 'data.endog']) - if 'formula' not in kwargs: # won't be able to unpickle without these + if 'formula' not in kwargs: # will not be able to unpickle without these self._data_attr.extend(['data.orig_endog', 'data.orig_exog']) # store keys for extras if we need to recreate model instance - # we don't need 'missing', maybe we need 'hasconst' + # we do not need 'missing', maybe we need 'hasconst' self._init_keys = list(kwargs.keys()) if hasconst is not None: self._init_keys.append('hasconst') @@ -96,7 +96,7 @@ def _handle_data(self, endog, exog, missing, hasconst, **kwargs): for key in kwargs: if key in ['design_info', 'formula']: # leave attached to data continue - # pop so we don't start keeping all these twice or references + # pop so we do not start keeping all these twice or references try: setattr(self, key, data.__dict__.pop(key)) except KeyError: # panel already pops keys in data handling @@ -437,7 +437,7 @@ def fit(self, start_params=None, method='newton', maxiter=100, "be specified") # TODO: separate args from nonarg taking score and hessian, ie., - # user-supplied and numerically evaluated estimate frprime doesn't take + # user-supplied and numerically evaluated estimate frprime does not take # args in most (any?) 
of the optimize function nobs = self.endog.shape[0] @@ -587,8 +587,8 @@ def _fit_zeros(self, keep_index=None, start_params=None, # create dummy results Instance, TODO: wire up properly # TODO: this could be moved into separate private method if needed # discrete L1 fit_regularized doens't reestimate AFAICS - # RLM doesn't have method, disp nor warn_convergence keywords - # OLS, WLS swallows extra kwds with **kwargs, but doesn't have method='nm' + # RLM does not have method, disp nor warn_convergence keywords + # OLS, WLS swallow extra kwds with **kwargs, but do not have method='nm' try: # Note: addding full_output=False causes exceptions res = self.fit(maxiter=0, disp=0, method='nm', skip_hessian=True, @@ -701,7 +701,7 @@ class GenericLikelihoodModel(LikelihoodModel): and a Hessian is 'newton' If they are not overwritten by a subclass, then numerical gradient, - Jacobian and Hessian of the log-likelihood are caclulated by numerical + Jacobian and Hessian of the log-likelihood are calculated by numerical forward differentiation. This might results in some cases in precision problems, and the Hessian might not be positive definite. Even if the Hessian is not positive definite the covariance matrix of the parameter @@ -748,7 +748,7 @@ def __init__(self, endog, exog=None, loglike=None, score=None, super(GenericLikelihoodModel, self).__init__(endog, exog, missing=missing) - # this won't work for ru2nmnl, maybe np.ndim of a dict? 
if exog is not None: self.nparams = (exog.shape[1] if np.ndim(exog) == 2 else 1) @@ -917,7 +917,7 @@ def fit(self, start_params=None, method='nm', maxiter=500, full_output=1, self._set_extra_params_names(['par%d' % i for i in range(-k_miss)]) else: - # I don't want to raise after we have already fit() + # I do not want to raise after we have already fit() import warnings warnings.warn('more exog_names than parameters', ValueWarning) @@ -1245,7 +1245,7 @@ def __init__(self, model, params, normalized_cov_params=None, scale=1., if cov_kwds is None: cov_kwds = {} use_t = self.use_t - # TODO: we shouldn't need use_t in get_robustcov_results + # TODO: we should not need use_t in get_robustcov_results get_robustcov_results(self, cov_type=cov_type, use_self=True, use_t=use_t, **cov_kwds) @@ -1268,7 +1268,7 @@ def _get_robustcov_results(self, cov_type='nonrobust', use_self=True, 'covariance matrix of the errors is correctly ' + 'specified.'} else: - # TODO: we shouldn't need use_t in get_robustcov_results + # TODO: we should not need use_t in get_robustcov_results get_robustcov_results(self, cov_type=cov_type, use_self=True, use_t=use_t, **cov_kwds) @@ -1755,7 +1755,7 @@ def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None, 'rank is %d' % (J, J_), ValueWarning) J = J_ - # TODO streamline computation, we don't need to compute J if given + # TODO streamline computation, we do not need to compute J if given if df_constraints is not None: # let caller override J by df_constraint J = df_constraints diff --git a/statsmodels/base/optimizer.py b/statsmodels/base/optimizer.py index 15ebb29a796..99d5894ef6f 100644 --- a/statsmodels/base/optimizer.py +++ b/statsmodels/base/optimizer.py @@ -241,7 +241,7 @@ def _fit_constrained(self, params): raise NotImplementedError def _fit_regularized(self, params): - # TODO: code won't necessarily be general here. 3 options. + # TODO: code will not necessarily be general here. 3 options. 
# 1) setup for scipy.optimize.fmin_sqlsqp # 2) setup for cvxopt # 3) setup for openopt @@ -592,7 +592,7 @@ def _fit_basinhopping(f, score, start_params, fargs, kwargs, disp=True, minimizer_kwargs['args'] = fargs minimizer_kwargs['jac'] = score method = minimizer_kwargs.get('method', None) - if method and method != 'L-BFGS-B': # l_bfgs_b doesn't take a hessian + if method and method != 'L-BFGS-B': # l_bfgs_b does not take a hessian minimizer_kwargs['hess'] = hess retvals = optimize.basinhopping(f, start_params, diff --git a/statsmodels/base/tests/test_data.py b/statsmodels/base/tests/test_data.py index 1e84b327c48..5368005a471 100644 --- a/statsmodels/base/tests/test_data.py +++ b/statsmodels/base/tests/test_data.py @@ -12,7 +12,7 @@ from statsmodels.discrete.discrete_model import Logit -# FIXME: don't leave commented-out, enable or move/remove +# FIXME: do not leave commented-out, enable or move/remove # class TestDates(object): # @classmethod # def setup_class(cls): @@ -434,7 +434,7 @@ def test_alignment(): endog = gs_l_realinv - # re-index because they won't conform to lint + # re-index because they will not conform to lint realgdp = gs_l_realgdp.reindex(lint.index, method='bfill') data = dict(const=np.ones_like(lint), lrealgdp=realgdp, lint=lint) exog = pd.DataFrame(data) @@ -860,7 +860,7 @@ def test_dtype_object(): def test_formula_missing_extra_arrays(): np.random.seed(1) - # because patsy can't turn off missing data-handling as of 0.3.0, we need + # because patsy cannot turn off missing data-handling as of 0.3.0, we need # separate tests to make sure that missing values are handled correctly # when going through formulas diff --git a/statsmodels/base/tests/test_generic_methods.py b/statsmodels/base/tests/test_generic_methods.py index 3826bcbe9dd..bc69322fc51 100644 --- a/statsmodels/base/tests/test_generic_methods.py +++ b/statsmodels/base/tests/test_generic_methods.py @@ -118,7 +118,7 @@ def test_zero_constrained(self): assert_allclose(pvals1, res2.pvalues, 
rtol=tol, atol=tol) if hasattr(res1, 'resid'): - # discrete models, Logit don't have `resid` yet + # discrete models, Logit do not have `resid` yet # atol discussion at gh-5158 rtol = 1e-10 atol = 1e-12 @@ -263,7 +263,7 @@ def test_zero_collinear(self): assert_allclose(pvals1, res2.pvalues, rtol=1e-6, atol=1e-30) if hasattr(res1, 'resid'): - # discrete models, Logit don't have `resid` yet + # discrete models, Logit do not have `resid` yet assert_allclose(res1.resid, res2.resid, rtol=1e-5, atol=1e-10) ex = res1.model.exog.mean(0) diff --git a/statsmodels/base/tests/test_predict.py b/statsmodels/base/tests/test_predict.py index aa188564f8d..1acc717023a 100644 --- a/statsmodels/base/tests/test_predict.py +++ b/statsmodels/base/tests/test_predict.py @@ -93,7 +93,7 @@ def setup_class(cls): x = np.random.randn(nobs, 3) y = x.sum(1) + np.random.randn(nobs) index = ['obs%02d' % i for i in range(nobs)] - # add one extra column to check that it doesn't matter + # add one extra column to check that it does not matter cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4), columns='y var1 var2 var3'.split(), index=index) @@ -110,7 +110,7 @@ def setup_class(cls): x = np.random.randn(nobs, 3) y = x.sum(1) + np.random.randn(nobs) index = ['obs%02d' % i for i in range(nobs)] - # add one extra column to check that it doesn't matter + # add one extra column to check that it does not matter cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4), columns='y var1 var2 var3'.split(), index=index) diff --git a/statsmodels/datasets/heart/data.py b/statsmodels/datasets/heart/data.py index de69102c02a..4be15cd3e0b 100644 --- a/statsmodels/datasets/heart/data.py +++ b/statsmodels/datasets/heart/data.py @@ -7,7 +7,7 @@ TITLE = """Transplant Survival Data""" -SOURCE = """ Miller, R. (1976). Least squares regression with censored dara. Biometrica, 63 (3). 449-464. +SOURCE = """Miller, R. (1976). Least squares regression with censored data. Biometrika, 63 (3). 449-464. 
""" diff --git a/statsmodels/datasets/utils.py b/statsmodels/datasets/utils.py index 69acdb40ab9..fbb9ef3454f 100644 --- a/statsmodels/datasets/utils.py +++ b/statsmodels/datasets/utils.py @@ -37,7 +37,7 @@ def webuse(data, baseurl='https://www.stata-press.com/data/r11/', as_df=True): Notes ----- - Make sure baseurl has trailing forward slash. Doesn't do any + Make sure baseurl has trailing forward slash. Does not do any error checking in response URLs. """ url = urljoin(baseurl, data+'.dta') @@ -148,7 +148,7 @@ def _urlopen_cached(url, cache): except: pass - # not using the cache or didn't find it in cache + # not using the cache or did not find it in cache if not from_cache: data = urlopen(url, timeout=3).read() if cache is not None: # then put it in the cache @@ -249,7 +249,7 @@ def get_data_home(data_home=None): in the user home folder. Alternatively, it can be set by the 'STATSMODELS_DATA' environment - variable or programatically by giving an explit folder path. The + variable or programatically by giving an explicit folder path. The '~' symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created. diff --git a/statsmodels/discrete/conditional_models.py b/statsmodels/discrete/conditional_models.py index ebd478f384d..84ad1d515c6 100644 --- a/statsmodels/discrete/conditional_models.py +++ b/statsmodels/discrete/conditional_models.py @@ -550,7 +550,7 @@ def fit(self, c = self.k_cat - 1 start_params = np.random.normal(size=q * c) - # Don't call super(...).fit because it can't handle the 2d-params. + # Do not call super(...).fit because it cannot handle the 2d-params. rslt = base.LikelihoodModel.fit( self, start_params=start_params, @@ -564,7 +564,7 @@ def fit(self, rslt = MultinomialResults(self, rslt) # Not clear what the null likelihood should be, there is no intercept - # so the null model isn't clearly defined. This is needed for summary + # so the null model is not clearly defined. 
This is needed for summary # to work. rslt.set_null_options(llnull=np.nan) diff --git a/statsmodels/discrete/discrete_margins.py b/statsmodels/discrete/discrete_margins.py index 958cd1b7c85..7090350a87c 100644 --- a/statsmodels/discrete/discrete_margins.py +++ b/statsmodels/discrete/discrete_margins.py @@ -71,7 +71,7 @@ def _get_dummy_index(X, const_idx): dummy_ind = _isdummy(X) dummy = True - if dummy_ind.size == 0: # don't waste your time + if dummy_ind.size == 0: # do not waste your time dummy = False dummy_ind = None # this gets passed to stand err func return dummy_ind, dummy @@ -106,7 +106,7 @@ def _get_count_index(X, const_idx): count_ind = _iscount(X) count = True - if count_ind.size == 0: # don't waste your time + if count_ind.size == 0: # do not waste your time count = False count_ind = None # for stand err func return count_ind, count @@ -152,7 +152,7 @@ def _get_count_effects(effects, exog, count_ind, method, model, params): exog0[:, i] += 2 effect1 = model.predict(params, exog0) #NOTE: done by analogy with dummy effects but untested bc - # stata doesn't handle both count and eydx anywhere + # stata does not handle both count and eydx anywhere if 'ey' in method: effect0 = np.log(effect0) effect1 = np.log(effect1) @@ -309,7 +309,7 @@ def margeff_cov_params(model, params, exog, cov_params, at, derivative, try: jacobian_mat = approx_fprime_cs(params, derivative, args=(exog,method)) - except TypeError: # norm.cdf doesn't take complex values + except TypeError: # norm.cdf does not take complex values from statsmodels.tools.numdiff import approx_fprime jacobian_mat = approx_fprime(params, derivative, args=(exog,method)) @@ -326,7 +326,7 @@ def margeff_cov_params(model, params, exog, cov_params, at, derivative, else: jacobian_mat = derivative - #NOTE: this won't go through for at == 'all' + #NOTE: this will not go through for at == 'all' return np.dot(np.dot(jacobian_mat, cov_params), jacobian_mat.T) def margeff_cov_with_se(model, params, exog, cov_params, at, 
derivative, @@ -725,7 +725,7 @@ def get_margeff(self, at='overall', method='dydx', atexog=None, order='F') self.margeff_cov = margeff_cov[effects_idx][:, effects_idx] else: - # don't care about at constant + # do not care about at constant # hack truncate effects_idx again if necessary # if eyex, then effects is truncated to be without extra params effects_idx = effects_idx[:len(effects)] diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py index 4092ed64cc5..e03e746c300 100644 --- a/statsmodels/discrete/discrete_model.py +++ b/statsmodels/discrete/discrete_model.py @@ -264,7 +264,7 @@ def fit_regularized(self, start_params=None, method='l1', auto_trim_tol : float For use when trim_mode == 'auto'. Use qc_tol : float - Print warning and don't allow auto trim when (ii) (above) is + Print warning and do not allow auto trim when (ii) (above) is violated by this much. qc_verbose : Boolean If true, print out a full QC report upon failure @@ -587,7 +587,7 @@ def predict(self, params, exog=None, linear=False): Column 0 is the base case, the rest conform to the rows of params shifted up one for the base case. """ - if exog is None: # do here to accomodate user-given exog + if exog is None: # do here to accommodate user-given exog exog = self.exog if exog.ndim == 1: exog = exog[None] @@ -2492,7 +2492,7 @@ class NegativeBinomial(CountModel): References ---------- - Greene, W. 2008. "Functional forms for the negtive binomial model + Greene, W. 2008. "Functional forms for the negative binomial model for count data". Economics Letters. Volume 99, Number 3, pp.585-590. Hilbe, J.M. 2011. "Negative binomial regression". Cambridge University Press.
@@ -2526,7 +2526,7 @@ def __init__(self, endog, exog, loglike_method='nb2', offset=None, else: self.k_extra = 0 # store keys for extras if we need to recreate model instance - # we need to append keys that don't go to super + # we need to append keys that do not go to super self._init_keys.append('loglike_method') def _initialize(self): @@ -2840,7 +2840,7 @@ def fit(self, start_params=None, method='bfgs', maxiter=35, full_output=1, disp=1, callback=None, cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs): - # Note: don't let super handle robust covariance because it has + # Note: do not let super handle robust covariance because it has # transformed params self._transparams = False # always define attribute if self.loglike_method.startswith('nb') and method not in ['newton', @@ -2882,7 +2882,7 @@ def fit(self, start_params=None, method='bfgs', maxiter=35, # TODO: Fix NBin _check_perfect_pred if self.loglike_method.startswith('nb'): # mlefit is a wrapped counts results - self._transparams = False # don't need to transform anymore now + self._transparams = False # do not need to transform anymore now # change from lnalpha to alpha if method not in ["newton", "ncg"]: mlefit._results.params[-1] = np.exp(mlefit._results.params[-1]) @@ -2908,7 +2908,7 @@ def fit_regularized(self, start_params=None, method='l1', if self.loglike_method.startswith('nb') and (np.size(alpha) == 1 and alpha != 0): - # don't penalize alpha if alpha is scalar + # do not penalize alpha if alpha is scalar k_params = self.exog.shape[1] + self.k_extra alpha = alpha * np.ones(k_params) alpha[-1] = 0 @@ -3385,7 +3385,7 @@ def __init__(self, model, mlefit, cov_type='nonrobust', cov_kwds=None, self.__dict__.update(mlefit.__dict__) if not hasattr(self, 'cov_type'): - # do this only if super, i.e. mlefit didn't already add cov_type + # do this only if super, i.e. 
mlefit did not already add cov_type # robust covariance if use_t is not None: self.use_t = use_t diff --git a/statsmodels/discrete/tests/results/results_discrete.py b/statsmodels/discrete/tests/results/results_discrete.py index d7d1dd77c43..6ef608034e1 100644 --- a/statsmodels/discrete/tests/results/results_discrete.py +++ b/statsmodels/discrete/tests/results/results_discrete.py @@ -684,8 +684,8 @@ def logit(): # mata # V = st_matrix("e(margeff_V)") # se = diagonal(cholesky(diag(V))) - # last SE taken from margins with i.psi, don't know how they - # don't know why margeff is different, but trust official results + # last SE taken from margins with i.psi, do not know how they + # do not know why margeff is different, but trust official results obj.margeff_count_dummy_dydxoverall_se = [.1094379569, .0177869773, .1420034] @@ -788,7 +788,7 @@ def probit(): -.3575735, .447951, -.7988633, -1.939208, .6021435, 1.196623, .9407793, -.8927477, .59048, .3128364, -1.246147, 2.045071] - # Stata doesn't have it, but I think it's just oversight + # Stata does not have it, but I think it's just oversight obj.resid_pearson = None # generalized residuals from gretl obj.resid_generalized = [ @@ -1000,7 +1000,7 @@ def negativebinomial_nb1_bfgs(): # From R, this is alpha/bse(alpha) 59.2190796881069 - # taken from Stata even though they don't report it + # taken from Stata even though they do not report it # lnalpha/bse(lnalpha) # 77.968995 ] diff --git a/statsmodels/discrete/tests/test_constrained.py b/statsmodels/discrete/tests/test_constrained.py index 92599e1ad04..91c3b742992 100644 --- a/statsmodels/discrete/tests/test_constrained.py +++ b/statsmodels/discrete/tests/test_constrained.py @@ -98,7 +98,7 @@ def test_basic_method(self): # other assert_allclose(res1.llf, res2.ll, rtol=1e-6) assert_equal(res1.df_model, res2.df_m) - # Stata doesn't have df_resid + # Stata does not have df_resid df_r = res2.N - res2.df_m - 1 assert_equal(res1.df_resid, df_r) else: diff --git 
a/statsmodels/discrete/tests/test_count_model.py b/statsmodels/discrete/tests/test_count_model.py index 9bbfafff56b..48c42d8fdfa 100644 --- a/statsmodels/discrete/tests/test_count_model.py +++ b/statsmodels/discrete/tests/test_count_model.py @@ -170,7 +170,7 @@ def setup_class(cls): cls.data = data exog = sm.add_constant(data.exog.iloc[:,1:4], prepend=False) exog_infl = sm.add_constant(data.exog.iloc[:,0], prepend=False) - # we don't need to verify convergence here + # we do not need to verify convergence here start_params = np.asarray([0.10337834587498942, -1.0459825102508549, -0.08219794475894268, 0.00856917434709146, -0.026795737379474334, 1.4823632430107334]) @@ -460,7 +460,7 @@ def test_predict_prob(self): assert_allclose(((prm - freq)**2).mean(), 0, rtol=1e-10, atol=1e-4) def test_predict_generic_zi(self): - # These tests don't use numbers from other packages. + # These tests do not use numbers from other packages. # Tests are on closeness of estimated to true/DGP values # and theoretical relationship between quantities res = self.res diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index cab550c6885..c1cb578891a 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -704,7 +704,7 @@ def setup_class(cls): # Do a regularized fit with alpha, effectively dropping the last column alpha = 10 * len(rand_data.endog) * np.ones(cls.kvars + 1) alpha[:cls.m] = 0 - alpha[-1] = 0 # don't penalize alpha + alpha[-1] = 0 # do not penalize alpha mod_reg = sm.NegativeBinomial(rand_data.endog, rand_exog) cls.res_reg = mod_reg.fit_regularized( @@ -1367,7 +1367,7 @@ def test_pred_table(self): # the rows should add up for pred table assert_array_equal(self.res1.pred_table().sum(0), np.bincount(pred)) - # note this is just a regression test, gretl doesn't have a prediction + # note this is just a regression test, gretl does not have a prediction # table pred = [[ 126., 41., 
2., 0., 0., 12., 19.], [ 77., 73., 3., 0., 0., 15., 12.], @@ -1460,7 +1460,7 @@ def test_poisson_predict(): def test_poisson_newton(): - #GH: 24, Newton doesn't work well sometimes + #GH: 24, Newton does not work well sometimes nobs = 10000 np.random.seed(987689) x = np.random.randn(nobs, 3) @@ -1752,7 +1752,7 @@ def test_t(self): def test_fit_regularized(self): model = self.res1.model - # don't penalize constant and dispersion parameter + # do not penalize constant and dispersion parameter alpha = np.ones(len(self.res1.params)) alpha[-2:] = 0 # the first prints currently a warning, irrelevant here @@ -2064,7 +2064,7 @@ def setup_class(cls): # Do a regularized fit with alpha, effectively dropping the last column alpha = 10 * len(rand_data.endog) * np.ones(cls.kvars + 1) alpha[:cls.m] = 0 - alpha[-1] = 0 # don't penalize alpha + alpha[-1] = 0 # do not penalize alpha mod_reg = sm.NegativeBinomialP(rand_data.endog, rand_exog) cls.res_reg = mod_reg.fit_regularized( @@ -2143,7 +2143,7 @@ def test_llnull(self): res_null1 = self.res_null assert_allclose(llf0, res_null1.llf, rtol=1e-6) - # Note default convergence tolerance doesn't get lower rtol + # Note default convergence tolerance does not get lower rtol # from different starting values (using bfgs) assert_allclose(res_null0.params, res_null1.params, rtol=5e-5) @@ -2351,7 +2351,7 @@ def test_unchanging_degrees_of_freedom(): # If res2.df_model == res1.df_model, then this test is invalid. res3 = model.fit(start_params=params, disp=0) - # Test that the call to `fit_regularized` didn't + # Test that the call to `fit_regularized` did not # modify model.df_model inplace. 
assert_equal(res3.df_model, res1.df_model) assert_equal(res3.df_resid, res1.df_resid) diff --git a/statsmodels/discrete/tests/test_margins.py b/statsmodels/discrete/tests/test_margins.py index cc87a8f4852..f72e77ad679 100644 --- a/statsmodels/discrete/tests/test_margins.py +++ b/statsmodels/discrete/tests/test_margins.py @@ -39,7 +39,7 @@ class TestPoissonMargin(CheckMarginMixin): @classmethod def setup_class(cls): - # here we don't need to check convergence from default start_params + # here we do not need to check convergence from default start_params start_params = [14.1709, 0.7085, -3.4548, -0.539, 3.2368, -7.9299, -5.0529] mod_poi = Poisson(endog, exog) @@ -58,7 +58,7 @@ class TestPoissonMarginDummy(CheckMarginMixin): @classmethod def setup_class(cls): - # here we don't need to check convergence from default start_params + # here we do not need to check convergence from default start_params start_params = [14.1709, 0.7085, -3.4548, -0.539, 3.2368, -7.9299, -5.0529] mod_poi = Poisson(endog, exog) @@ -75,7 +75,7 @@ class TestNegBinMargin(CheckMarginMixin): @classmethod def setup_class(cls): - # here we don't need to check convergence from default start_params + # here we do not need to check convergence from default start_params start_params = [13.1996, 0.8582, -2.8005, -1.5031, 2.3849, -8.5552, -2.88, 1.14] mod = NegativeBinomial(endog, exog) @@ -94,7 +94,7 @@ class TestNegBinMarginDummy(CheckMarginMixin): @classmethod def setup_class(cls): - # here we don't need to check convergence from default start_params + # here we do not need to check convergence from default start_params start_params = [13.1996, 0.8582, -2.8005, -1.5031, 2.3849, -8.5552, -2.88, 1.14] mod = NegativeBinomial(endog, exog) @@ -113,7 +113,7 @@ class TestNegBinPMargin(CheckMarginMixin): @classmethod def setup_class(cls): - # here we don't need to check convergence from default start_params + # here we do not need to check convergence from default start_params start_params = [13.1996, 0.8582, 
-2.8005, -1.5031, 2.3849, -8.5552, -2.88, 1.14] mod = NegativeBinomialP(endog, exog) # checks also that default p=2 diff --git a/statsmodels/discrete/tests/test_sandwich_cov.py b/statsmodels/discrete/tests/test_sandwich_cov.py index e3e7d1df07c..8675607b876 100644 --- a/statsmodels/discrete/tests/test_sandwich_cov.py +++ b/statsmodels/discrete/tests/test_sandwich_cov.py @@ -33,7 +33,7 @@ data = data_raw.dropna() #mod = smd.Poisson.from_formula('accident ~ yr_con + op_75_79', data=dat) -# Don't use formula for tests against Stata because intercept needs to be last +# Do not use formula for tests against Stata because intercept needs to be last endog = data['accident'] exog_data = data['yr_con op_75_79'.split()] exog = add_constant(exog_data, prepend=False) @@ -484,7 +484,7 @@ def setup_class(cls): def test_score_hessian(self): res1 = self.res1 res2 = self.res2 - # Note scale is fixed at 1, so we don't need to fix it explicitly + # Note scale is fixed at 1, so we do not need to fix it explicitly score1 = res1.model.score(res1.params * 0.98) score2 = res2.model.score(res1.params * 0.98) assert_allclose(score1, score2, rtol=1e-13) diff --git a/statsmodels/duration/hazard_regression.py b/statsmodels/duration/hazard_regression.py index 6616823306b..9c6f642b053 100644 --- a/statsmodels/duration/hazard_regression.py +++ b/statsmodels/duration/hazard_regression.py @@ -1187,7 +1187,7 @@ class bunch: standard_errors = None ret_val = bunch() - # Don't do anything with offset here because we want to allow + # Do not do anything with offset here because we want to allow # different offsets to be specified even if exog is the model # exog. exog_provided = True @@ -1202,7 +1202,7 @@ class bunch: elif self.offset is not None and not exog_provided: lhr += self.offset - # Handle lhr and hr prediction first, since they don't make + # Handle lhr and hr prediction first, since they do not make # use of the hazard function. 
if pred_type == "lhr": @@ -1408,7 +1408,7 @@ def get_distribution(self): ----- The distributions are obtained from a simple discrete estimate of the survivor function that puts all mass on the observed - failure times wihtin a stratum. + failure times within a stratum. """ return self.model.get_distribution(self.params) diff --git a/statsmodels/duration/survfunc.py b/statsmodels/duration/survfunc.py index 990025fcbe8..22b785a1590 100644 --- a/statsmodels/duration/survfunc.py +++ b/statsmodels/duration/survfunc.py @@ -381,7 +381,7 @@ def plot(self, ax=None): >>> li[0].set_color('purple') >>> li[1].set_color('purple') - Don't show the censoring points: + Do not show the censoring points: >>> fig = sf.plot() >>> ax = fig.get_axes()[0] diff --git a/statsmodels/duration/tests/test_phreg.py b/statsmodels/duration/tests/test_phreg.py index 13011ec26d2..a2f368b0e17 100644 --- a/statsmodels/duration/tests/test_phreg.py +++ b/statsmodels/duration/tests/test_phreg.py @@ -382,7 +382,7 @@ def test_fit_regularized(self): model = PHReg(time, exog, status=status, ties='breslow') sm_result = model.fit_regularized(alpha=s) - # The agreement isn't very high, the issue may be on + # The agreement is not very high, the issue may be on # the R side. See below for further checks. assert_allclose(sm_result.params, params, rtol=0.3) diff --git a/statsmodels/emplike/descriptive.py b/statsmodels/emplike/descriptive.py index 5ab59276e34..5886af19424 100644 --- a/statsmodels/emplike/descriptive.py +++ b/statsmodels/emplike/descriptive.py @@ -498,7 +498,7 @@ def test_mean(self, mu0, return_weights=False): Mean value to be tested return_weights : bool - If return_weights is True the funtion returns + If return_weights is True the function returns the weights of the observations under the null hypothesis. Default is False @@ -538,7 +538,7 @@ def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8, Lagrange (see Owen pg 22) and then determine the weights. 
'nested brent' uses brents method to find the confidence - intervals but must maximize the likelihhod ratio on every + intervals but must maximize the likelihood ratio on every iteration. gamma is generally much faster. If the optimizations does not @@ -565,7 +565,7 @@ def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8, When using 'gamma', amount to decrease (increase) the minimum (maximum) by to start the search for gamma. - If function returns f(a) and f(b) must have differnt signs, + If function returns f(a) and f(b) must have different signs, consider lowering epsilon. Returns diff --git a/statsmodels/examples/ex_feasible_gls_het.py b/statsmodels/examples/ex_feasible_gls_het.py index a0ae7b5b670..90517160cb7 100644 --- a/statsmodels/examples/ex_feasible_gls_het.py +++ b/statsmodels/examples/ex_feasible_gls_het.py @@ -14,8 +14,8 @@ Author: Josef Perktold -There might be something fishy with the example, but I don't see it. -Or maybe it's supposed to be this way because in the first case I don't +There might be something fishy with the example, but I do not see it. +Or maybe it's supposed to be this way because in the first case I do not include a constant and in the second case I include some of the same regressors as in the main equation. 
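The `ci_mean` hunk above quotes the error "f(a) and f(b) must have different signs" and advises lowering `epsilon`; that message comes from the root-bracketing requirement of Brent's method. A minimal sketch of the requirement using plain `scipy.optimize.brentq`, not statsmodels internals — the function `f` below is an arbitrary illustration, not the emplike objective:

```python
# Sketch of the bracketing requirement behind the error message quoted in
# the ci_mean docstring above. Plain scipy, not statsmodels code.
import numpy as np
from scipy.optimize import brentq

def f(x):
    # arbitrary illustration function with roots at -2 and 2
    return x ** 2 - 4

# Valid bracket: f(0) < 0 < f(5), so Brent's method can locate the root.
root = brentq(f, 0, 5)
assert np.isclose(root, 2.0)

# Invalid bracket: f(3) and f(5) are both positive, so brentq raises the
# ValueError that the docstring's epsilon advice is meant to avoid.
try:
    brentq(f, 3, 5)
except ValueError as err:
    print("bracketing failed:", err)
```

Adjusting the search bounds until the endpoints straddle the root — which is what the docstring's epsilon tweak accomplishes — is what restores the required sign change.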
diff --git a/statsmodels/examples/ex_generic_mle.py b/statsmodels/examples/ex_generic_mle.py index 41062988ba3..667906a2b80 100644 --- a/statsmodels/examples/ex_generic_mle.py +++ b/statsmodels/examples/ex_generic_mle.py @@ -40,7 +40,7 @@ def probitloglike(params, endog, exog): #datal = sm.datasets.longley.load(as_pandas=False) datal = sm.datasets.ccard.load(as_pandas=False) datal.exog = sm.add_constant(datal.exog, prepend=False) -# Instance of GenericLikelihood model doesn't work directly, because loglike +# Instance of GenericLikelihood model does not work directly, because loglike # cannot get access to data in self.endog, self.exog nobs = 5000 @@ -114,7 +114,7 @@ def loglikeobs(self, params): res_norm3.model.score(res_norm3.params) #fprime in fit option cannot be overwritten, set to None, when score is defined -# exception is fixed, but I don't think score was supposed to be called +# exception is fixed, but I do not think score was supposed to be called res_bfgs = mod_norm2.fit(start_params=start_params, method="bfgs", fprime=None, maxiter=500, retall=0) diff --git a/statsmodels/examples/ex_generic_mle_tdist.py b/statsmodels/examples/ex_generic_mle_tdist.py index 022642d8e7c..4959b811bae 100644 --- a/statsmodels/examples/ex_generic_mle_tdist.py +++ b/statsmodels/examples/ex_generic_mle_tdist.py @@ -14,7 +14,7 @@ #import for kstest based estimation #should be replace # FIXME: importing these patches scipy distribution classes in-place. -# Don't do this. +# Do not do this. import statsmodels.sandbox.distributions.sppatch # noqa:F401 @@ -155,7 +155,7 @@ def nloglikeobs(self, params): ##################### Example: Pareto -# estimating scale doesn't work yet, a bug somewhere ? +# estimating scale does not work yet, a bug somewhere ? 
# fit_ks works well, but no bse or other result statistics yet @@ -195,7 +195,7 @@ def fit_ks(self): '''fit Pareto with nested optimization originally published on stackoverflow - this doesn't trim lower values during ks optimization + this does not trim lower values during ks optimization ''' rvs = self.endog @@ -311,7 +311,7 @@ def pareto_ks(loc, rvs): print(res_par.params[1:].sum(), sum(res_parks[1:]), mod_par.endog.min()) -#start new model, so we don't get two result instances with the same model instance +#start new model, so we do not get two result instances with the same model instance mod_par = MyPareto(y) mod_par.fixed_params = fixdf mod_par.fixed_paramsmask = np.isnan(fixdf) diff --git a/statsmodels/examples/ex_rootfinding.py b/statsmodels/examples/ex_rootfinding.py index ba79cb5fa23..3eebf1d02e1 100644 --- a/statsmodels/examples/ex_rootfinding.py +++ b/statsmodels/examples/ex_rootfinding.py @@ -85,7 +85,7 @@ def func2(x, a): raise ValueError('start_upp needs to be positive') -499.999996336 ''' - ''' this doesn't work + ''' this does not work >>> print(brentq_expanding(func, args=(-500,), start_upp=-1000) raise ValueError('start_upp needs to be positive') OverflowError: (34, 'Result too large') diff --git a/statsmodels/examples/ex_sandwich2.py b/statsmodels/examples/ex_sandwich2.py index 3de7a9a7188..224bca7ee8c 100644 --- a/statsmodels/examples/ex_sandwich2.py +++ b/statsmodels/examples/ex_sandwich2.py @@ -28,7 +28,7 @@ # #does currently not cache file y = srs['api00'] -#older numpy don't reorder +#older numpy do not reorder #x = srs[['growth', 'emer', 'yr_rnd']].view(float).reshape(len(y), -1) #force sequence x = np.column_stack([srs[ii] for ii in ['growth', 'emer', 'yr_rnd']]) diff --git a/statsmodels/examples/example_rpy.py b/statsmodels/examples/example_rpy.py index b3854ed903b..bc3c9601c0e 100644 --- a/statsmodels/examples/example_rpy.py +++ b/statsmodels/examples/example_rpy.py @@ -4,7 +4,7 @@ # example 1: OLS using LM # example 2: GLM with 
binomial family - The second results isn't exactly correct since it assumes that each + The second result is not exactly correct since it assumes that each obvervation has the same number of trials see datasets/longley for an R script with the correct syntax. diff --git a/statsmodels/examples/l1_demo/demo.py b/statsmodels/examples/l1_demo/demo.py index cf5b258b08c..ee399d24cdb 100644 --- a/statsmodels/examples/l1_demo/demo.py +++ b/statsmodels/examples/l1_demo/demo.py @@ -164,7 +164,7 @@ def run_demo(mode, base_alpha=0.01, N=500, get_l1_slsqp_results=False, # Here we scale it with N for simplicity. In practice, you should # use cross validation to pick alpha alpha = base_alpha * N * sp.ones((num_nonconst_covariates+1, num_targets-1)) - alpha[0,:] = 0 # Don't regularize the intercept + alpha[0,:] = 0 # Do not regularize the intercept #### Make the data and model exog = get_exog(N, num_nonconst_covariates, cor_length) diff --git a/statsmodels/examples/l1_demo/short_demo.py b/statsmodels/examples/l1_demo/short_demo.py index ec252a9e879..83100a2606d 100644 --- a/statsmodels/examples/l1_demo/short_demo.py +++ b/statsmodels/examples/l1_demo/short_demo.py @@ -63,7 +63,7 @@ ## Set the regularization parameter. alpha = 10 * np.ones((mlogit_mod.J - 1, mlogit_mod.K)) -# Don't regularize the constant +# Do not regularize the constant alpha[-1,:] = 0 mlogit_l1_res = mlogit_mod.fit_regularized(method='l1', alpha=alpha) print(mlogit_l1_res.params) @@ -93,9 +93,9 @@ alphas = 1 / np.logspace(-0.5, 2, N) ## Sweep alpha and store the coefficients -# QC check doesn't always pass with the default options. +# QC check does not always pass with the default options. # Use the options QC_verbose=True and disp=True -# to to see what is happening. It just barely doesn't pass, so I decreased +# to see what is happening.
It just barely does not pass, so I decreased # acc and increased QC_tol to make it pass for n, alpha in enumerate(alphas): logit_res = logit_mod.fit_regularized( diff --git a/statsmodels/examples/l1_demo/sklearn_compare.py b/statsmodels/examples/l1_demo/sklearn_compare.py index 04569fb377f..98f1b096afd 100644 --- a/statsmodels/examples/l1_demo/sklearn_compare.py +++ b/statsmodels/examples/l1_demo/sklearn_compare.py @@ -14,7 +14,7 @@ The results "prove" that the regularization paths are the same. Note that finding the reparameterization is non-trivial since the coefficient paths - are NOT monotonic. As a result, the paths don't match up perfectly. + are NOT monotonic. As a result, the paths do not match up perfectly. """ from statsmodels.compat.python import lrange from sklearn import linear_model diff --git a/statsmodels/examples/run_all.py b/statsmodels/examples/run_all.py index 9f89368afdb..90a22094a56 100644 --- a/statsmodels/examples/run_all.py +++ b/statsmodels/examples/run_all.py @@ -1,4 +1,4 @@ -'''run all examples to make sure we don't get an exception +'''run all examples to make sure we do not get an exception Note: If an example contaings plt.show(), then all plot windows have to be closed diff --git a/statsmodels/examples/try_fit_constrained.py b/statsmodels/examples/try_fit_constrained.py index dcfb89638ee..c0fc094c6f3 100644 --- a/statsmodels/examples/try_fit_constrained.py +++ b/statsmodels/examples/try_fit_constrained.py @@ -66,7 +66,7 @@ #tri2 = TransformRestriction(Ri2, q) #p = tri.expand([1,1]) assert_raises(ValueError, TransformRestriction, Ri2, q) - # L doesn't have full row rank, calculating constant fails with Singular Matrix + # L does not have full row rank, calculating constant fails with Singular Matrix # transform data xr = T x np.random.seed(1) @@ -74,7 +74,7 @@ xr = tr1.reduce(x) # roundtrip x2 = tr1.expand(xr) - # this doesn't hold ? don't use constant? don't need it anyway ? + # this does not hold ? do not use constant? 
do not need it anyway ? #assert_allclose(x2, x, rtol=1e-14) @@ -119,7 +119,7 @@ print(params) print(res3_ols.params) print(res3_ols.bse) - # the following raises `ValueError: can't test a constant constraint` + # the following raises `ValueError: cannot test a constant constraint` #tt = res3.t_test(transf3.transf_mat, transf3.constant.squeeze()) #print tt.sd cov_params3 = transf3.transf_mat.dot(res3.cov_params()).dot(transf3.transf_mat.T) @@ -155,7 +155,7 @@ print('\nPoisson') print(paramsp) print(poisson_res.params) - # error because I don't use the unconstrained basic model + # error because I do not use the unconstrained basic model # tp = transform_params_constraint(poisson_res.params, poisson_res.cov_params(), transfp.R, transfp.q) # cov_params3 = transf3.transf_mat.dot(res3.cov_params()).dot(transf3.transf_mat.T) # bse3 = np.sqrt(np.diag(cov_params3)) diff --git a/statsmodels/examples/tsa/ex_arma.py b/statsmodels/examples/tsa/ex_arma.py index 77f00a46fb9..811d5b2b35b 100644 --- a/statsmodels/examples/tsa/ex_arma.py +++ b/statsmodels/examples/tsa/ex_arma.py @@ -1,6 +1,6 @@ ''' -doesn't seem to work so well anymore even with nobs=1000 ??? +does not seem to work so well anymore even with nobs=1000 ??? works ok if noise variance is large ''' diff --git a/statsmodels/gam/generalized_additive_model.py b/statsmodels/gam/generalized_additive_model.py index a55ecb826b5..175eec33e02 100644 --- a/statsmodels/gam/generalized_additive_model.py +++ b/statsmodels/gam/generalized_additive_model.py @@ -357,7 +357,7 @@ def plot_partial(self, smooth_index, plot_se=True, cpr=False, ax.plot(x, y_est + 1.96 * se, '-', c='blue') ax.plot(x, y_est - 1.96 * se, '-', c='blue') if cpr: - # TODO: resid_response doesn't make sense with nonlinear link + # TODO: resid_response does not make sense with nonlinear link # use resid_working ? 
cpr_ = y_est + self.resid_working ax.plot(x, cpr_, '.', lw=2) @@ -729,7 +729,7 @@ def _fit_pirls(self, alpha, start_params=None, maxiter=100, tol=1e-8, lin_pred += self._offset_exposure mu = self.family.fitted(lin_pred) - # We don't need to update scale in GLM/LEF models + # We do not need to update scale in GLM/LEF models # We might need it in dispersion models. # self.scale = self.estimate_scale(mu) history = self._update_history(wls_results, mu, history) @@ -968,7 +968,7 @@ def penalized_wls(endog, exog, penalty_matrix, weights): results : Results instance of WLS """ y, x, s = endog, exog, penalty_matrix - # TODO: I don't understand why I need 2 * s + # TODO: I do not understand why I need 2 * s aug_y, aug_x, aug_weights = make_augmented_matrix(y, x, 2 * s, weights) wls_results = lm.WLS(aug_y, aug_x, aug_weights).fit() # TODO: use MinimalWLS during iterations, less overhead diff --git a/statsmodels/gam/smooth_basis.py b/statsmodels/gam/smooth_basis.py index a608a99cdbf..ffcc9ae3c25 100644 --- a/statsmodels/gam/smooth_basis.py +++ b/statsmodels/gam/smooth_basis.py @@ -58,7 +58,7 @@ def _eval_bspline_basis(x, knots, degree, deriv='all', include_intercept=True): x = x[:, 0] assert x.ndim == 1 # XX FIXME: when points fall outside of the boundaries, splev and R seem - # to handle them differently. I don't know why yet. So until we understand + # to handle them differently. I do not know why yet. So until we understand # this and decide what to do with it, I'm going to play it safe and # disallow such points. if np.min(x) < np.min(knots) or np.max(x) > np.max(knots): @@ -68,11 +68,11 @@ def _eval_bspline_basis(x, knots, degree, deriv='all', include_intercept=True): # Thanks to Charles Harris for explaining splev. 
It's not well # documented, but basically it computes an arbitrary b-spline basis # given knots and degree on some specificed points (or derivatives - # thereof, but we don't use that functionality), and then returns some + # thereof, but we do not use that functionality), and then returns some # linear combination of these basis functions. To get out the basis # functions themselves, we use linear combinations like [1, 0, 0], [0, # 1, 0], [0, 0, 1]. - # NB: This probably makes it rather inefficient (though I haven't checked + # NB: This probably makes it rather inefficient (though I have not checked # to be sure -- maybe the fortran code actually skips computing the basis # function for coefficients that are zero). # Note: the order of a spline is the same as its degree + 1. diff --git a/statsmodels/gam/tests/test_gam.py b/statsmodels/gam/tests/test_gam.py index e8aaef0377a..830768afdf0 100644 --- a/statsmodels/gam/tests/test_gam.py +++ b/statsmodels/gam/tests/test_gam.py @@ -337,7 +337,7 @@ def test_multivariate_gam_1d_data(): def test_multivariate_gam_cv(): # SMOKE test - # no test is performed. It only checks that there isn't any runtime error + # no test is performed. 
It only checks that there is not any runtime error def cost(x1, x2): return np.linalg.norm(x1 - x2) / len(x1) @@ -703,7 +703,7 @@ def test_partial_values2(): def test_partial_values(): - # this test is only approximate because we don't use the same spline + # this test is only approximate because we do not use the same spline # basis functions (knots) as mgcv cur_dir = os.path.dirname(os.path.abspath(__file__)) file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv") diff --git a/statsmodels/genmod/_prediction.py b/statsmodels/genmod/_prediction.py index 0ebd390da7b..5a8efbbcf0c 100644 --- a/statsmodels/genmod/_prediction.py +++ b/statsmodels/genmod/_prediction.py @@ -134,8 +134,8 @@ def summary_frame(self, what='all', alpha=0.05): to_include['mean_ci_upper'] = ci_mean[:, 1] self.table = to_include - #OrderedDict doesn't work to preserve sequence - # pandas dict doesn't handle 2d_array + #OrderedDict does not work to preserve sequence + # pandas dict does not handle 2d_array #data = np.column_stack(list(to_include.values())) #names = .... res = pd.DataFrame(to_include, index=self.row_labels, diff --git a/statsmodels/genmod/bayes_mixed_glm.py b/statsmodels/genmod/bayes_mixed_glm.py index 12b609f11f3..7bc894c2d53 100644 --- a/statsmodels/genmod/bayes_mixed_glm.py +++ b/statsmodels/genmod/bayes_mixed_glm.py @@ -767,12 +767,12 @@ def fit_vb(self, # caller) s = np.log(sd) - # Don't allow the variance parameter starting mean values to + # Do not allow the variance parameter starting mean values to # be too small. i1, i2 = self.k_fep, self.k_fep + self.k_vcp m[i1:i2] = np.where(m[i1:i2] < -1, -1, m[i1:i2]) - # Don't allow the posterior standard deviation starting values + # Do not allow the posterior standard deviation starting values # to be too small. 
s = np.where(s < -1, -1, s) diff --git a/statsmodels/genmod/cov_struct.py b/statsmodels/genmod/cov_struct.py index 73fdd696413..18d8e434933 100644 --- a/statsmodels/genmod/cov_struct.py +++ b/statsmodels/genmod/cov_struct.py @@ -167,7 +167,7 @@ def covariance_matrix_solve(self, expval, index, stdev, rhs): self.cov_adjust.append(cov_adjust) - # Last resort if we still can't factor the covariance matrix. + # Last resort if we still cannot factor the covariance matrix. if not success: warnings.warn( "Unable to condition covariance matrix to an SPD " @@ -675,7 +675,7 @@ class Autoregressive(CovStruct): The autocorrelation parameter is estimated using weighted nonlinear least squares, regressing each value within a cluster on - each preceeding value in the same cluster. + each preceding value in the same cluster. Parameters ---------- @@ -1289,7 +1289,7 @@ def _make_pairs(self, i, j): bmat = np.ascontiguousarray(mat).view(dtype) _, idx = np.unique(bmat, return_index=True) except TypeError: - # workaround for old numpy that can't call unique with complex + # workaround for old numpy that cannot call unique with complex # dtypes rs = np.random.RandomState(4234) bmat = np.dot(mat, rs.uniform(size=mat.shape[1])) diff --git a/statsmodels/genmod/families/family.py b/statsmodels/genmod/families/family.py index bc35a9442be..d587fa06792 100644 --- a/statsmodels/genmod/families/family.py +++ b/statsmodels/genmod/families/family.py @@ -1033,7 +1033,7 @@ class InverseGaussian(Family): Notes ----- - The inverse Guassian distribution is sometimes referred to in the + The inverse Gaussian distribution is sometimes referred to in the literature as the Wald distribution. 
""" diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index 7ff9b9778d3..9722e199df0 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -231,7 +231,7 @@ def unpack_cov(self, bcov): Gaussian | x x x inv Gaussian | x x x binomial | x x x x x x x x x - Poission | x x x + Poisson | x x x neg binomial | x x x x gamma | x x x @@ -246,7 +246,7 @@ def unpack_cov(self, bcov): other packages. The "naive" estimator gives smaller standard errors, but is only correct if the working correlation structure is correctly specified. The "bias reduced" estimator of Mancl and - DeRouen (Biometrics, 2001) reduces the downard bias of the robust + DeRouen (Biometrics, 2001) reduces the downward bias of the robust estimator. The robust covariance provided here follows Liang and Zeger (1986) @@ -1203,7 +1203,7 @@ def predict(self, params, exog=None, offset=None, exog = self.exog if not isinstance(self.family.link, families.links.Log): - # Don't need to worry about exposure + # Do not need to worry about exposure if offset is None: if self._offset_exposure is not None: _offset = self._offset_exposure.copy() @@ -1299,7 +1299,7 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, self._fit_history['dep_params'].append( self.cov_struct.dep_params) - # Don't exit until the association parameters have been + # Do not exit until the association parameters have been # updated at least once. 
if (del_params < ctol and (num_assoc_updates > 0 or self.update_dep is False)): @@ -1359,7 +1359,7 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, cov_robust_bc=bc_cov) # The superclass constructor will multiply the covariance - # matrix argument bcov by scale, which we don't want, so we + # matrix argument bcov by scale, which we do not want, so we # divide bcov by the scale parameter here results = GEEResults(self, mean_params, bcov / scale, scale, cov_type=cov_type, use_t=False, @@ -1774,7 +1774,7 @@ def __init__(self, model, params, cov_params, scale, attr_kwds = kwds.pop('attr_kwds', {}) self.__dict__.update(attr_kwds) - # we don't do this if the cov_type has already been set + # we do not do this if the cov_type has already been set # subclasses can set it through attr_kwds if not (hasattr(self, 'cov_type') and hasattr(self, 'cov_params_default')): @@ -2036,7 +2036,7 @@ def conf_int(self, alpha=.05, cols=None, cov_type=None): ----- The confidence interval is based on the Gaussian distribution. 
""" - # super doesn't allow to specify cov_type and method is not + # super does not allow to specify cov_type and method is not # implemented, # FIXME: remove this method here if cov_type is None: @@ -3250,7 +3250,7 @@ def get_margeff(self, at='overall', method='dydx', atexog=None, model._derivative_exog, dummy_idx, count_idx, method, 1) - # don't care about at constant + # do not care about at constant self.margeff_cov = margeff_cov[effects_idx][:, effects_idx] self.margeff_se = margeff_se[effects_idx] self.margeff = effects[effects_idx] diff --git a/statsmodels/genmod/generalized_linear_model.py b/statsmodels/genmod/generalized_linear_model.py index b42461f21b6..9216a9700f1 100644 --- a/statsmodels/genmod/generalized_linear_model.py +++ b/statsmodels/genmod/generalized_linear_model.py @@ -196,7 +196,7 @@ class GLM(base.LikelihoodModel): Gaussian x x x x x x x x x inv Gaussian x x x binomial x x x x x x x x x - Poission x x x + Poisson x x x neg binomial x x x x gamma x x x Tweedie x x x @@ -207,7 +207,7 @@ class GLM(base.LikelihoodModel): Endog and exog are references so that if the data they refer to are already arrays and these arrays are changed, endog and exog will change. - Statsmodels supports two separte definitions of weights: frequency weights + Statsmodels supports two separate definitions of weights: frequency weights and variance weights. Frequency weights produce the same results as repeating observations by the @@ -1003,7 +1003,7 @@ def fit(self, start_params=None, maxiter=100, method='IRLS', tol=1e-8, 'lstsq' and 'pinv' regularize the estimate in singular and near-singular cases by truncating small singular values based on `rcond` of the respective numpy.linalg function. 'qr' is - only valied for cases that are not singular nor near-singular. + only valid for cases that are not singular nor near-singular. optim_hessian : {'eim', 'oim'}, optional (available with scipy optimizer fits) When 'oim'--the default--the observed Hessian is used in fitting. 
'eim' is the expected Hessian. diff --git a/statsmodels/genmod/tests/results/poisson_weights_v2.do b/statsmodels/genmod/tests/results/poisson_weights_v2.do index 31cc0a0b174..c11ead7086a 100644 --- a/statsmodels/genmod/tests/results/poisson_weights_v2.do +++ b/statsmodels/genmod/tests/results/poisson_weights_v2.do @@ -25,7 +25,7 @@ label variable var10 "fweight" rename var10 fweight label variable LN_VC100k96 "LN_VC100k96" -/* for checkin Poisson produces the same, poisson doesn't allow aweights */ +/* for checking Poisson produces the same, poisson does not allow aweights */ /*poisson executions income perpoverty perblack LN_VC100k96 south degree */ @@ -151,7 +151,7 @@ estmat2nparray params_table cov infocrit predicted resids, saving(`filename') fo /*------------------*/ /*******************************************************************/ -/*********** next with robust = HC1, don't save resid and similar */ +/*********** next with robust = HC1, do not save resid and similar */ drop `pred' `res' glm executions income perpoverty perblack LN_VC100k96 south degree, family(poisson) vce(robust) diff --git a/statsmodels/genmod/tests/results/results_glm.py b/statsmodels/genmod/tests/results/results_glm.py index b97f578080f..75b1117d251 100644 --- a/statsmodels/genmod/tests/results/results_glm.py +++ b/statsmodels/genmod/tests/results/results_glm.py @@ -1176,7 +1176,7 @@ class InvGauss(object): # eta = np.dot(X, params) # mu = 1/np.sqrt(eta) # sigma = .5 - # This isn't correct. Errors need to be normally distributed + # This is not correct. Errors need to be normally distributed # But Y needs to be Inverse Gaussian, so we could build it up # by throwing out data?
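The GLM docstring hunk above notes that frequency weights produce the same results as physically repeating observations. A plain-NumPy sketch of that equivalence for the Gaussian/least-squares case (toy data, made up here; statsmodels itself exposes this via GLM's `freq_weights` argument):

```python
import numpy as np

# toy data: the first row occurs three times in the "full" data set
x = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])
y = np.array([2., 3., 5., 7.])
freq = np.array([3, 1, 1, 1])

# weighted normal equations with frequency weights ...
beta_w = np.linalg.solve(x.T @ (freq[:, None] * x), x.T @ (freq * y))

# ... give the same point estimates as OLS on the repeated data
x_rep = np.repeat(x, freq, axis=0)
y_rep = np.repeat(y, freq)
beta_rep, *_ = np.linalg.lstsq(x_rep, y_rep, rcond=None)
```

The two solutions agree exactly because the weighted cross-products `X'WX` and `X'Wy` are identical to the cross-products of the expanded data; the distinction from variance weights only shows up in scale estimates and degrees of freedom, as the docstring goes on to discuss.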
# Refs: diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py index 39f72061e8a..c7f3834e0c5 100644 --- a/statsmodels/genmod/tests/test_gee.py +++ b/statsmodels/genmod/tests/test_gee.py @@ -278,7 +278,7 @@ def test_poisson_epil(self): family=families.Poisson()) rslt2 = mod2.fit() - # don't use wrapper, asserts_xxx don't work + # do not use wrapper, asserts_xxx do not work rslt1 = rslt1._results rslt2 = rslt2._results @@ -494,7 +494,7 @@ def test_logistic(self): assert_almost_equal(mdf.standard_errors(), se[j], decimal=6) - # FIXME: don't leave commented-out + # FIXME: do not leave commented-out # Check for run-time exceptions in summary # print(mdf.summary()) @@ -1150,7 +1150,7 @@ def test_compare_OLS(self): ols = lm.OLS.from_formula("Y ~ X1 + X2 + X3", data=D).fit() - # don't use wrapper, asserts_xxx don't work + # do not use wrapper, asserts_xxx do not work ols = ols._results assert_almost_equal(ols.params, mdf.params, decimal=10) @@ -1578,7 +1578,7 @@ def test_cov_type(self): res_robust_bc = mod.fit(start_params=self.start_params, cov_type='bias_reduced') - # call summary to make sure it doesn't change cov_type + # call summary to make sure it does not change cov_type res_naive.summary() res_robust_bc.summary() @@ -1607,7 +1607,7 @@ def test_cov_type(self): assert_allclose(res.cov_params(), cov, rtol=rtol, atol=1e-10) assert_allclose(res.cov_params_default, cov, rtol=rtol, atol=1e-10) - # assert that we don't have a copy + # assert that we do not have a copy assert_(res_robust.cov_params_default is res_robust.cov_robust) assert_(res_naive.cov_params_default is res_naive.cov_naive) assert_(res_robust_bc.cov_params_default is diff --git a/statsmodels/genmod/tests/test_glm.py b/statsmodels/genmod/tests/test_glm.py index 3008ada193c..a73d42ffde8 100644 --- a/statsmodels/genmod/tests/test_glm.py +++ b/statsmodels/genmod/tests/test_glm.py @@ -82,7 +82,7 @@ def test_residuals(self): def test_aic_R(self): # R includes the estimation of 
the scale as a lost dof - # Doesn't with Gamma though + # Does not with Gamma though if self.res1.scale != 1: dof = 2 else: @@ -255,7 +255,7 @@ def setup_class(cls): def test_compare_OLS(self): res1 = self.res1 - # OLS doesn't define score_obs + # OLS does not define score_obs from statsmodels.regression.linear_model import OLS resd = OLS(self.data.endog, self.data.exog).fit() self.resd = resd # attach to access from the outside @@ -511,7 +511,7 @@ def setup_class(cls): cls.res1 = res1 # res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma) res2 = Scotvote() - res2.aic_R += 2 # R doesn't count degree of freedom for scale with gamma + res2.aic_R += 2 # R does not count degree of freedom for scale with gamma cls.res2 = res2 class TestGlmGammaLog(CheckModelResultsMixin): @@ -686,14 +686,14 @@ def setup_class(cls): family=fam).fit(scale='x2') from .results.results_glm import Committee res2 = Committee() - res2.aic_R += 2 # They don't count a degree of freedom for the scale + res2.aic_R += 2 # They do not count a degree of freedom for the scale cls.res2 = res2 # FIXME: enable or delete # def setup(self): # if skipR: # raise SkipTest, "Rpy not installed" -# r.library('MASS') # this doesn't work when done in rmodelwrap? +# r.library('MASS') # this does not work when done in rmodelwrap? # self.res2 = RModel(self.data.endog, self.data.exog, r.glm, # family=r.negative_binomial(1)) # self.res2.null_deviance = 27.8110469364343 @@ -1321,7 +1321,7 @@ def test_standard_errors(self): decimal_resids = DECIMAL_4 - # TODO: This doesn't work... Arrays are of different shape. + # TODO: This does not work... Arrays are of different shape. # Perhaps we use self.res1.model.family.resid_XXX()? 
""" def test_residuals(self): @@ -1340,7 +1340,7 @@ def test_residuals(self): def test_aic(self): # R includes the estimation of the scale as a lost dof - # Doesn't with Gamma though + # Does not with Gamma though assert_allclose(self.res1.aic, self.res2.aic, atol=1e-6, rtol=1e-6) def test_deviance(self): @@ -1548,7 +1548,7 @@ class TestWtdGlmInverseGaussian(CheckWtdDuplicationMixin): @classmethod def setup_class(cls): ''' - Tests InverseGuassian family with log link. + Tests InverseGaussian family with log link. ''' super(TestWtdGlmInverseGaussian, cls).setup_class() family_link = sm.families.InverseGaussian(sm.families.links.log()) @@ -2006,7 +2006,7 @@ def test_tweedie_EQL_upper_limit(): def testTweediePowerEstimate(): # Test the Pearson estimate of the Tweedie variance and scale parameters. # - # Ideally, this would match the following R code, but I can't make it work... + # Ideally, this would match the following R code, but I cannot make it work... # # setwd('c:/workspace') # data <- read.csv('cpunish.csv', sep=",") diff --git a/statsmodels/genmod/tests/test_glm_weights.py b/statsmodels/genmod/tests/test_glm_weights.py index d3457a923ae..670a2fe2b17 100644 --- a/statsmodels/genmod/tests/test_glm_weights.py +++ b/statsmodels/genmod/tests/test_glm_weights.py @@ -74,7 +74,7 @@ def test_basic(self): # Binomial ll and deviance are different for 1d vs. counts... 
return None if isinstance(self, TestGlmGaussianWLS): - # This won't work right now either + # This will not work right now either return None if not isinstance(self, (TestGlmGaussianAwNr, TestGlmGammaAwNr)): # Matching R is hard @@ -87,7 +87,7 @@ def test_residuals(self): TestTweedieRepeatedvsAverage, TestBinomial0RepeatedvsAverage, TestBinomial0RepeatedvsDuplicated)): - # This won't match as different number of records + # This will not match as different number of records return None res1 = self.res1 res2 = self.res2 @@ -101,7 +101,7 @@ def test_residuals(self): assert_allclose(res1.resid_working, resid_all['resid_working'], atol= 1e-6, rtol=2e-6) if resid_all.get('resid_anscombe') is None: return None - # Stata doesn't use var_weights in anscombe residuals, it seems. + # Stata does not use var_weights in anscombe residuals, it seems. # Adjust residuals to match our approach. with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) @@ -371,7 +371,7 @@ def test_r_llf(self): # SM uses (essentially) stat's loglike formula... 
first term is # (endog - mu) ** 2 / scale adj_sm = -1 / 2 * ((model.endog - res1.mu) ** 2).sum() / scale - # R has these 2 terms that stata/sm don't + # R has these 2 terms that stata/sm do not adj_r = -model.wnobs / 2 + np.sum(np.log(model.var_weights)) / 2 llf_adj = llf - adj_sm + adj_r assert_allclose(llf_adj, res2.ll, atol=1e-6, rtol=1e-7) @@ -450,10 +450,10 @@ def test_wtd_gradient_irls(): if family_class != fam.Binomial and binom_version == 1: continue elif family_class == fam.Binomial and link == lnk.cloglog: - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif family_class == fam.Binomial and link == lnk.log: - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) == (fam.Poisson, lnk.identity): lin_pred = 20 + exog.sum(1) @@ -462,27 +462,27 @@ elif (family_class, link) == (fam.Poisson, lnk.sqrt): lin_pred = -2 + exog.sum(1) elif (family_class, link) == (fam.Gamma, lnk.log): - # Can't get gradient to converge with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) == (fam.Gamma, lnk.identity): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) == (fam.Gamma, lnk.inverse_power): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) == (fam.Gaussian, lnk.log): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) == (fam.Gaussian, lnk.inverse_power): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here continue elif (family_class, link) ==
(fam.InverseGaussian, lnk.log): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here lin_pred = -1 + exog.sum(1) continue elif (family_class, link) == (fam.InverseGaussian, lnk.identity): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here lin_pred = 20 + 5*exog.sum(1) lin_pred = np.clip(lin_pred, 1e-4, np.inf) continue @@ -505,7 +505,7 @@ def test_wtd_gradient_irls(): continue # skip due to non-convergence elif (family_class, link) == (fam.NegativeBinomial, lnk.inverse_power): - # Can't get gradient to converage with var_weights here + # Cannot get gradient to converge with var_weights here lin_pred = 1 + exog.sum(1) / 5 continue diff --git a/statsmodels/graphics/factorplots.py b/statsmodels/graphics/factorplots.py index 0d2d802cba8..1a87f876808 100644 --- a/statsmodels/graphics/factorplots.py +++ b/statsmodels/graphics/factorplots.py @@ -17,7 +17,7 @@ def interaction_plot(x, trace, response, func=np.mean, ax=None, plottype='b', Interaction plot for factor level statistics. Note. If categorial factors are supplied levels will be internally - recoded to integers. This ensures matplotlib compatiblity. + recoded to integers. This ensures matplotlib compatibility. uses pandas.DataFrame to calculate an `aggregate` statistic for each level of the factor or group given by `trace`. diff --git a/statsmodels/graphics/functional.py b/statsmodels/graphics/functional.py index 20b04820a6c..f67126a9326 100644 --- a/statsmodels/graphics/functional.py +++ b/statsmodels/graphics/functional.py @@ -476,7 +476,7 @@ def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD', A functional boxplot is the analog of a boxplot for functional data. Functional data is any type of data that varies over a continuum, i.e. - curves, probabillity distributions, seasonal data, etc. + curves, probability distributions, seasonal data, etc.
The data is first ordered, the order statistic used here is `banddepth`. Plotted are then the median curve, the envelope of the 50% central region, @@ -543,7 +543,7 @@ def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD', Outliers are defined as curves that fall outside the band created by multiplying the central region by `wfactor`. Note that the range over - which they fall outside this band doesn't matter, a single data point + which they fall outside this band does not matter, a single data point outside the band is enough. If the data is noisy, smoothing may therefore be required. diff --git a/statsmodels/graphics/gofplots.py b/statsmodels/graphics/gofplots.py index cdefa47ced9..fae2d321e01 100644 --- a/statsmodels/graphics/gofplots.py +++ b/statsmodels/graphics/gofplots.py @@ -694,7 +694,7 @@ def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'): Examples -------- - Import the food expenditure dataset. Plot annual food expendeture on x-axis + Import the food expenditure dataset. Plot annual food expenditure on x-axis and household income on y-axis. Use qqline to add regression line into the plot. @@ -728,7 +728,7 @@ def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'): raise ValueError("If line is not 45, x and y cannot be None.") elif line == 'r': # could use ax.lines[0].get_xdata(), get_ydata(), - # but don't know axes are 'clean' + # but do not know axes are 'clean' y = OLS(y, add_constant(x)).fit().fittedvalues ax.plot(x,y,fmt) elif line == 's': diff --git a/statsmodels/graphics/mosaicplot.py b/statsmodels/graphics/mosaicplot.py index c227db04b54..d03738a21ce 100644 --- a/statsmodels/graphics/mosaicplot.py +++ b/statsmodels/graphics/mosaicplot.py @@ -151,7 +151,7 @@ def _hierarchical_split(count_dict, horizontal=True, gap=0.05): count_dict. This is the function that actually perform the tiling for the creation of the mosaic plot. 
If the gap array has been specified it will insert a corresponding amount of space (proportional to the - unit lenght), while retaining the proportionality of the tiles. + unit length), while retaining the proportionality of the tiles. Parameters ---------- @@ -159,14 +159,14 @@ count_dict : dict Dictionary containing the contingency table. Each category should contain a non-negative number with a tuple as index. It expects that all the combination - of keys to be representes; if that is not true, will + of keys to be represented; if that is not true, will automatically consider the missing values as 0 horizontal : bool The starting direction of the split (by default along the horizontal axis) gap : float or array of floats The list of gaps to be applied on each subdivision. - If the lenght of the given array is less of the number + If the length of the given array is less of the number of subcategories (or if it's a single number) it will extend it with exponentially decreasing gaps @@ -229,7 +229,7 @@ def _create_default_properties(data): first it will varies the color hue (first category) then the color saturation (second category) and then the color value (third category). If a fourth category is found, it will put - decoration on the rectangle. Doesn't manage more than four + decoration on the rectangle.
Does not manage more than four level of categories """ categories_levels = _categories_level(list(iterkeys(data))) @@ -309,7 +309,7 @@ def _normalize_data(data, index): data = contingency # reorder the keys order according to the one specified by the user # or if the index is None convert it into a simple list - # right now it doesn't do any check, but can be modified in the future + # right now it does not do any check, but can be modified in the future index = lrange(len(categories_levels)) if index is None else index contingency = OrderedDict() for key, value in iteritems(data): @@ -481,7 +481,7 @@ def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005, The contingency table that contains the data. Each category should contain a non-negative number with a tuple as index. It expects that all the combination - of keys to be representes; if that is not true, will + of keys to be represented; if that is not true, will automatically consider the missing values as 0. The order of the keys will be the same as the one of insertion. If a dict of a Series (or any other dict like object) @@ -490,7 +490,7 @@ numerical labels. index: list, optional Gives the preferred order for the category ordering. If not specified - will default to the given order. It doesn't support named indexes + will default to the given order. It does not support named indexes for hierarchical Series. If a DataFrame is provided, it expects a list with the name of the columns. ax : matplotlib.Axes, optional @@ -501,7 +501,7 @@ the horizontal axis) gap : float or array of floats The list of gaps to be applied on each subdivision.
- If the lenght of the given array is less of the number + If the length of the given array is less of the number of subcategories (or if it's a single number) it will extend it with exponentially decreasing gaps labelizer : function (key) -> string, optional @@ -587,7 +587,7 @@ def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005, >>> mosaic(data, title='hierarchical index series') >>> plt.show() - The third accepted data structureis the np array, for which a + The third accepted data structure is the np array, for which a very simple index will be created. >>> rand = np.random.random diff --git a/statsmodels/graphics/plot_grids.py b/statsmodels/graphics/plot_grids.py index cb5914dc51d..009bfaa0813 100644 --- a/statsmodels/graphics/plot_grids.py +++ b/statsmodels/graphics/plot_grids.py @@ -114,7 +114,7 @@ def scatter_ellipse(data, level=0.9, varnames=None, ell_kwds=None, for j in range(i): #print i,j, i*(nvars-1)+j+1 ax = fig.add_subplot(nvars-1, nvars-1, (i-1)*(nvars-1)+j+1) -## #sharey=ax_last) #sharey doesn't allow empty ticks? +## #sharey=ax_last) #sharey does not allow empty ticks? ## if j == 0: ## print 'new ax_last', j ## ax_last = ax diff --git a/statsmodels/graphics/regressionplots.py b/statsmodels/graphics/regressionplots.py index ee5dad156d6..9d26393d9a9 100644 --- a/statsmodels/graphics/regressionplots.py +++ b/statsmodels/graphics/regressionplots.py @@ -187,7 +187,7 @@ def plot_regress_exog(results, exog_idx, fig=None): Load the Statewide Crime data set and build a model with regressors including the rate of high school graduation (hs_grad), population in urban areas (urban), households below poverty line (poverty), and single person - households (single). Outcome variable is the muder rate (murder). + households (single). Outcome variable is the murder rate (murder). 
Build a 2 by 2 figure based on poverty showing fitted versus actual murder rate, residuals versus the poverty rate, partial regression plot of poverty, @@ -284,7 +284,7 @@ def _partial_regression(endog, exog_i, exog_others): exog_others """ - #FIXME: This function doesn't appear to be used. + #FIXME: This function does not appear to be used. res1a = OLS(endog, exog_others).fit() res1b = OLS(exog_i, exog_others).fit() res1c = OLS(res1a.resid, res1b.resid).fit() @@ -321,7 +321,7 @@ def plot_partregress(endog, exog_i, exog_others, data=None, labels. If obs_labels is a boolean, the point labels will try to do the right thing. First it will try to use the index of data, then fall back to the index of exog_i. Alternatively, you may give an - array-like object corresponding to the obseveration numbers. + array-like object corresponding to the observation numbers. labels_kwargs : dict Keyword arguments that control annotate for the observation labels. ax : Matplotlib AxesSubplot instance, optional @@ -525,7 +525,7 @@ def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None): exog = results.model.exog k_vars = exog.shape[1] - # this function doesn't make sense if k_vars=1 + # this function does not make sense if k_vars=1 nrows = (len(exog_idx) + 1) // 2 ncols = 1 if nrows == len(exog_idx) else 2 @@ -778,7 +778,7 @@ def abline_plot(intercept=None, slope=None, horiz=None, vert=None, .. plot:: plots/graphics_regression_abline.py """ - if ax is not None: # get axis limits first thing, don't change these + if ax is not None: # get axis limits first thing, do not change these x = ax.get_xlim() else: x = None @@ -1091,7 +1091,7 @@ def ceres_resids(results, focus_exog, frac=0.66, cond_means=None): if cond_means is None: # Below we calculate E[x | focus] where x is each column other - # than the focus column. We don't want the intercept when we do + # than the focus column. We do not want the intercept when we do # this so we remove it here. 
pexog = model.exog[:, ix_nf] pexog -= pexog.mean(0) diff --git a/statsmodels/imputation/mice.py b/statsmodels/imputation/mice.py index 700c621212d..8a12d2e30af 100644 --- a/statsmodels/imputation/mice.py +++ b/statsmodels/imputation/mice.py @@ -154,7 +154,7 @@ class MICEData(object): Parameters ---------- data : Pandas data frame - The data set, whch is copied internally. + The data set, which is copied internally. perturbation_method : string The default perturbation method k_pmm : int @@ -339,7 +339,7 @@ def set_imputer(self, endog_name, formula=None, model_class=None, If regularized[name]=True, `fit_regularized` rather than `fit` is called when fitting imputation models for this variable. When regularized[name]=True for any variable, - pertrurbation_method must be set to boot. + perturbation_method must be set to boot. Notes ----- diff --git a/statsmodels/iolib/foreign.py b/statsmodels/iolib/foreign.py index ea31c4259c2..3191c063fc4 100644 --- a/statsmodels/iolib/foreign.py +++ b/statsmodels/iolib/foreign.py @@ -101,14 +101,14 @@ def _stata_elapsed_date_to_datetime(date, fmt): date - ty years since 0000 - If you don't have pandas with datetime support, then you can't do + If you do not have pandas with datetime support, then you cannot do milliseconds accurately. """ #NOTE: we could run into overflow / loss of precision situations here - # casting to int, but I'm not sure what to do. datetime won't deal with - # numpy types and numpy datetime isn't mature enough / we can't rely on + # casting to int, but I'm not sure what to do. datetime will not deal with + # numpy types and numpy datetime is not mature enough / we cannot rely on # pandas version > 0.7.1 - #TODO: IIRC relative delta doesn't play well with np.datetime? + #TODO: IIRC relative delta does not play well with np.datetime? 
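`_stata_elapsed_date_to_datetime` above converts Stata elapsed dates by offsetting from the 1960-01-01 epoch. A minimal sketch for the `%td` (days) and `%tc` (milliseconds, ignoring Stata's leap-second variant `%tC`) formats — the helper names are mine, and the overflow and precision caveats in the comments above still apply:

```python
import datetime

STATA_EPOCH = datetime.datetime(1960, 1, 1)

def stata_td_to_datetime(days):
    """%td: elapsed days since 1960-01-01."""
    return STATA_EPOCH + datetime.timedelta(days=int(days))

def stata_tc_to_datetime(ms):
    """%tc: elapsed milliseconds since 1960-01-01 (int cast drops sub-ms)."""
    return STATA_EPOCH + datetime.timedelta(milliseconds=int(ms))
```

For example, `stata_td_to_datetime(0)` is midnight on 1960-01-01, and one day of milliseconds (86,400,000) under `%tc` lands on 1960-01-02.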
date = int(date) stata_epoch = datetime.datetime(1960, 1, 1) if fmt in ["%tc", "tc"]: @@ -140,7 +140,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): elif fmt in ["%ty", "ty"]: if date > 0: return datetime.datetime(date, 1, 1) - else: # don't do negative years bc can't mix dtypes in column + else: # do not do negative years bc cannot mix dtypes in column raise ValueError("Year 0 and before not implemented") else: raise ValueError("Date fmt %s not understood" % fmt) @@ -900,7 +900,7 @@ def _write_data_nodates(self): var = _type_converters[typ](var) self._write(pack(byteorder+TYPE_MAP[typ], var)) except struct_error: - # have to be strict about type pack won't do any + # have to be strict about type pack will not do any # kind of casting self._write(pack(byteorder+TYPE_MAP[typ], _type_converters[typ](var))) @@ -993,7 +993,7 @@ def genfromdta(fname, missing_flt=-999., encoding=None, pandas=False, data = np.zeros((nobs), dtype=dt) # init final array for rownum,line in enumerate(stata_dta): - # doesn't handle missing value objects, just casts + # does not handle missing value objects, just casts # None will only work without missing value object. if None in line: for i,val in enumerate(line): diff --git a/statsmodels/iolib/openfile.py b/statsmodels/iolib/openfile.py index 757a0a0fe0b..5909c066cf3 100644 --- a/statsmodels/iolib/openfile.py +++ b/statsmodels/iolib/openfile.py @@ -17,7 +17,7 @@ def __enter__(self): return self._obj def __exit__(self, *args): - '''Don't hide anything''' + '''Do not hide anything''' return False def __getattr__(self, name): diff --git a/statsmodels/iolib/summary.py b/statsmodels/iolib/summary.py index 887cfc6dc0d..c8973c1c6b1 100644 --- a/statsmodels/iolib/summary.py +++ b/statsmodels/iolib/summary.py @@ -292,7 +292,7 @@ def summary_top(results, title=None, gleft=None, gright=None, yname=None, xname= ('No. 
Observations:', lambda: [d_or_f(results.nobs)]), ('Df Model:', lambda: [d_or_f(results.df_model)]), ('Df Residuals:', lambda: [d_or_f(results.df_resid)]), - ('Log-Likelihood:', lambda: ["%#8.5g" % results.llf]) # doesn't exist for RLM - exception + ('Log-Likelihood:', lambda: ["%#8.5g" % results.llf]) # does not exist for RLM - exception ]) if title is None: @@ -347,7 +347,7 @@ def summary_top(results, title=None, gleft=None, gright=None, yname=None, xname= # fill up with blank lines to same length, just to keep it symmetric gen_left += [(' ', ' ')] * (len(gen_right) - len(gen_left)) - # padding in SimpleTable doesn't work like I want + # padding in SimpleTable does not work like I want #force extra spacing and exact string length in right table gen_right = [('%-21s' % (' '+k), v) for k,v in gen_right] gen_stubs_right, gen_data_right = zip_longest(*gen_right) #transpose row col @@ -411,7 +411,7 @@ def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True, if isinstance(results, tuple): # for multivariate endog - # TODO: check whether I don't want to refactor this + # TODO: check whether I do not want to refactor this #we need to give parameter alpha to conf_int results, params, std_err, tvalues, pvalues, conf_int = results else: @@ -492,7 +492,7 @@ def summary_params_frame(results, yname=None, xname=None, alpha=.05, if isinstance(results, tuple): # for multivariate endog - # TODO: check whether I don't want to refactor this + # TODO: check whether I do not want to refactor this #we need to give parameter alpha to conf_int results, params, std_err, tvalues, pvalues, conf_int = results else: @@ -625,7 +625,7 @@ def summary_params_2dflat(result, endog_names=None, exog_names=None, alpha=0.05, raise ValueError('endog_names has wrong length') n_equ = 1 - #VAR doesn't have conf_int + #VAR does not have conf_int #params = res.params.T # this is a convention for multi-eq models # check that we have the right length of names diff --git 
a/statsmodels/iolib/summary2.py b/statsmodels/iolib/summary2.py index 96c12f6d162..15374f5df26 100644 --- a/statsmodels/iolib/summary2.py +++ b/statsmodels/iolib/summary2.py @@ -306,7 +306,7 @@ def time_now(*args, **kwds): try: out[key] = func(results) except (AttributeError, KeyError, NotImplementedError): - # NOTE: some models don't have loglike defined (RLM), + # NOTE: some models do not have loglike defined (RLM), # so raise NotImplementedError pass return out diff --git a/statsmodels/iolib/table.py b/statsmodels/iolib/table.py index be033b93361..72339cc8a43 100644 --- a/statsmodels/iolib/table.py +++ b/statsmodels/iolib/table.py @@ -368,7 +368,7 @@ def as_text(self, **fmt_dict): fmt = self._get_fmt('txt', **fmt_dict) # get rows formatted as strings formatted_rows = [row.as_string('text', **fmt) for row in self] - rowlen = len(formatted_rows[-1]) # don't use header row + rowlen = len(formatted_rows[-1]) # do not use header row # place decoration above the table body, if desired table_dec_above = fmt.get('table_dec_above', '=') @@ -618,7 +618,7 @@ def _decorate_below(self, row_as_string, output_format, **fmt_dict): elif output_format == 'latex': result = row_as_string + "\n" + dec_below else: - raise ValueError("I can't decorate a %s header." % + raise ValueError("I cannot decorate a %s header." 
% output_format) return result diff --git a/statsmodels/iolib/tests/test_foreign.py b/statsmodels/iolib/tests/test_foreign.py index 47384a4af1a..d49c64e2575 100644 --- a/statsmodels/iolib/tests/test_foreign.py +++ b/statsmodels/iolib/tests/test_foreign.py @@ -132,7 +132,7 @@ def test_stata_writer_pandas(): if dta5.dtypes[1] is np.dtype('int64'): ptesting.assert_frame_equal(dta.reset_index(), dta5) else: - # don't check index because it has different size, int32 versus int64 + # do not check index because it has different size, int32 versus int64 ptesting.assert_frame_equal(dta4, dta5[dta5.columns[1:]]) def test_stata_writer_unicode(): diff --git a/statsmodels/miscmodels/count.py b/statsmodels/miscmodels/count.py index e5c3ad69979..5e737fe13e4 100644 --- a/statsmodels/miscmodels/count.py +++ b/statsmodels/miscmodels/count.py @@ -18,7 +18,7 @@ -> hessian inverts and bse look ok if row and column are dropped, pinv also works * GenericMLE: still get somewhere (where?) "CacheWriteWarning: The attribute 'bse' cannot be overwritten" -* bfgs is too fragile, doesn't come back +* bfgs is too fragile, does not come back * `nm` is slow but seems to work * need good start_params and their use in genericmle needs to be checked for consistency, set as attribute or method (called as attribute) @@ -160,7 +160,7 @@ def __init__(self, endog, exog=None, offset=None, missing='none', **kwds): super(PoissonOffsetGMLE, self).__init__(endog, exog, missing=missing, **kwds) -#this was added temporarily for bug-hunting, but shouldn't be needed +#this was added temporarily for bug-hunting, but should not be needed # def loglike(self, params): # return -self.nloglikeobs(params).sum(0) diff --git a/statsmodels/miscmodels/tmodel.py b/statsmodels/miscmodels/tmodel.py index d0d62e389a7..a82eeb5a735 100644 --- a/statsmodels/miscmodels/tmodel.py +++ b/statsmodels/miscmodels/tmodel.py @@ -26,7 +26,7 @@ TODO ---- * add starting values based on OLS -* bugs: store_params doesn't seem to be defined, I 
think this was a module +* bugs: store_params does not seem to be defined, I think this was a module global for debugging - commented out * parameter restriction: check whether version with some fixed parameters works diff --git a/statsmodels/miscmodels/try_mlecov.py b/statsmodels/miscmodels/try_mlecov.py index 615677f8cf2..bfaa74ca7dd 100644 --- a/statsmodels/miscmodels/try_mlecov.py +++ b/statsmodels/miscmodels/try_mlecov.py @@ -185,7 +185,7 @@ def fit_invertible(self, *args, **kwds): #ma = [1] np.random.seed(9875789) y = arma_generate_sample(ar,ma,nobs,2) - y -= y.mean() #I haven't checked treatment of mean yet, so remove + y -= y.mean() #I have not checked treatment of mean yet, so remove mod = MLEGLS(y) mod.nar, mod.nma = 2, 2 #needs to be added, no init method mod.nobs = len(y) diff --git a/statsmodels/multivariate/factor.py b/statsmodels/multivariate/factor.py index c87af240185..7aba15d4b2e 100644 --- a/statsmodels/multivariate/factor.py +++ b/statsmodels/multivariate/factor.py @@ -180,7 +180,7 @@ def fit(self, maxiter=50, tol=1e-8, start=None, opt_method='BFGS', maxiter : int Maximum number of iterations for iterative estimation algorithms tol : float - Stopping critera (error tolerance) for iterative estimation + Stopping criteria (error tolerance) for iterative estimation algorithms start : array_like Starting values, currently only used for ML estimation @@ -382,7 +382,7 @@ def score(self, par): dl += 2*luz dl -= 2*np.dot(lud, luz) - # Can't use _pack because we are working with the square root + # Cannot use _pack because we are working with the square root # uniquenesses directly. return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog) @@ -509,7 +509,7 @@ class FactorResults(object): Each column is the loading vector for one factor loadings_no_rot : ndarray Unrotated loadings, not available under maximum likelihood - analyis. + analysis. 
eigenvalues : ndarray The eigenvalues for a factor analysis obtained using principal components; not available under ML estimation. @@ -791,7 +791,7 @@ def get_loadings_frame(self, style='display', sort_=True, threshold=0.3, applied * 'display' add sorting and styling as defined by other keywords * 'strings' returns a DataFrame with string elements with optional sorting - and surpressing small loading coefficients. + and suppressing small loading coefficients. sort_ : boolean If True, then the rows of the DataFrame is sorted by contribution of each diff --git a/statsmodels/multivariate/factor_rotation/_analytic_rotation.py b/statsmodels/multivariate/factor_rotation/_analytic_rotation.py index 98550044d3e..6d2461129cf 100644 --- a/statsmodels/multivariate/factor_rotation/_analytic_rotation.py +++ b/statsmodels/multivariate/factor_rotation/_analytic_rotation.py @@ -54,7 +54,7 @@ def target_rotation(A, H, full_rank=False): [2] Schonemann (1966) - A generalized solution of the orthogonal procrustes problem - [3] Gower, Dijksterhuis (2004) - Procustes problems + [3] Gower, Dijksterhuis (2004) - Procrustes problems """ ATH = A.T.dot(H) if full_rank or np.linalg.matrix_rank(ATH) == A.shape[1]: @@ -112,7 +112,7 @@ def promax(A, k=2): Promax rotation is performed in the following steps: - * Deterine varimax rotated patterns :math:`V`. + * Determine varimax rotated patterns :math:`V`. * Construct a rotation target matrix :math:`|V_{ij}|^k/V_{ij}` diff --git a/statsmodels/multivariate/factor_rotation/_wrappers.py b/statsmodels/multivariate/factor_rotation/_wrappers.py index 49bf18d58e0..0be57aaefae 100644 --- a/statsmodels/multivariate/factor_rotation/_wrappers.py +++ b/statsmodels/multivariate/factor_rotation/_wrappers.py @@ -55,7 +55,7 @@ def rotate_factors(A, method, *method_args, **algorithm_kwargs): stop criterion, algorithm stops if Frobenius norm of gradient is smaller then tol - For analytic, the supporeted arguments depend on the method, see above. 
+ For analytic, the supported arguments depend on the method, see above. See the lower level functions for more details. diff --git a/statsmodels/nonparametric/_kernel_base.py b/statsmodels/nonparametric/_kernel_base.py index 251c78176da..c86f21032b1 100644 --- a/statsmodels/nonparametric/_kernel_base.py +++ b/statsmodels/nonparametric/_kernel_base.py @@ -412,7 +412,7 @@ class LeaveOneOut(object): Notes ----- - A little lighter weight than sklearn LOO. We don't need test index. + A little lighter weight than sklearn LOO. We do not need test index. Also passes views on X, not the index. """ def __init__(self, X): diff --git a/statsmodels/nonparametric/_smoothers_lowess.pyx b/statsmodels/nonparametric/_smoothers_lowess.pyx index 017c961753a..1fa559c7d6d 100644 --- a/statsmodels/nonparametric/_smoothers_lowess.pyx +++ b/statsmodels/nonparametric/_smoothers_lowess.pyx @@ -198,7 +198,7 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog, if last_fit_i >= n-1: break - # Calculate residual weights, but don't bother on the last iteration. + # Calculate residual weights, but do not bother on the last iteration. 
if robiter < it - 1: resid_weights = calculate_residual_weights(y, y_fit) diff --git a/statsmodels/nonparametric/kde.py b/statsmodels/nonparametric/kde.py index 7e9625ad28f..2658b95c437 100644 --- a/statsmodels/nonparametric/kde.py +++ b/statsmodels/nonparametric/kde.py @@ -318,7 +318,7 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None nobs = len(X) # after trim if gridsize is None: - gridsize = max(nobs,50) # don't need to resize if no FFT + gridsize = max(nobs,50) # do not need to resize if no FFT # handle weights if weights is None: @@ -354,7 +354,7 @@ def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None kern.seth(bw) # truncate to domain - if kern.domain is not None: # won't work for piecewise kernels like parzen + if kern.domain is not None: # will not work for piecewise kernels like parzen z_lo, z_high = kern.domain domain_mask = (k < z_lo) | (k > z_high) k = kern(k) # estimate density @@ -422,7 +422,7 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N Notes ----- - Only the default kernel is implemented. Weights aren't implemented yet. + Only the default kernel is implemented. Weights are not implemented yet. This follows Silverman (1982) with changes suggested by Jones and Lotwick (1984). However, the discretization step is replaced by linear binning of Fan and Marron (1994). This should be extended to accept the parts @@ -442,7 +442,7 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N Series C. 31.2, 93-9. """ X = np.asarray(X) - X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns. + X = X[np.logical_and(X > clip[0], X < clip[1])] # will not work for two columns. # will affect underlying data? 
# Get kernel object corresponding to selection @@ -488,7 +488,7 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N y = forrt(binned) # step 3 and 4 for optimal bw compute zstar and the density estimate f - # don't have to redo the above if just changing bw, ie., for cross val + # do not have to redo the above if just changing bw, i.e., for cross val #NOTE: silverman_transform is the closed form solution of the FFT of the #gaussian kernel. Not yet sure how to generalize it. @@ -544,4 +544,4 @@ def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=N except: # ft = np.loadtxt('./ft_silver.csv') # smooth = np.loadtxt('./smooth_silver.csv') - print("Didn't get the estimates from the Silverman algorithm") + print("Did not get the estimates from the Silverman algorithm") diff --git a/statsmodels/nonparametric/kernel_regression.py b/statsmodels/nonparametric/kernel_regression.py index 19d3be30b18..9d58abdfbfc 100644 --- a/statsmodels/nonparametric/kernel_regression.py +++ b/statsmodels/nonparametric/kernel_regression.py @@ -472,7 +472,7 @@ class KernelCensoredReg(KernelReg): bw: array_like Either a user-specified bandwidth or the method for bandwidth selection. - cv_ls: cross-validaton least squares + cv_ls: cross-validation least squares aic: AIC Hurvich Estimator censor_val: float Value at which the dependent variable is censored @@ -761,7 +761,7 @@ def _compute_lambda(self, Y, X): b = b[:, self.test_vars] b = np.reshape(b, (n, len(self.test_vars))) #fct = np.std(b) # Pivot the statistic by dividing by SE - fct = 1. # Don't Pivot -- Bootstrapping works better if Pivot + fct = 1. # Do not Pivot -- Bootstrapping works better if Pivot lam = ((b / fct) ** 2).sum() / float(n) return lam @@ -851,7 +851,7 @@ class TestRegCoefD(TestRegCoefC): Notes ----- - This class currently doesn't allow joint hypothesis. + This class currently does not allow joint hypothesis. 
Only one variable can be tested at a time References diff --git a/statsmodels/nonparametric/smoothers_lowess.py b/statsmodels/nonparametric/smoothers_lowess.py index 106a9ad5cc2..9afd9ea7f6d 100644 --- a/statsmodels/nonparametric/smoothers_lowess.py +++ b/statsmodels/nonparametric/smoothers_lowess.py @@ -184,5 +184,5 @@ def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, is_sorted=False, yfitted_[mask_valid] = yfitted yfitted = yfitted_ - # we don't need to return exog anymore + # we do not need to return exog anymore return yfitted diff --git a/statsmodels/nonparametric/tests/test_kernel_regression.py b/statsmodels/nonparametric/tests/test_kernel_regression.py index faf936f6f95..62d17546f72 100644 --- a/statsmodels/nonparametric/tests/test_kernel_regression.py +++ b/statsmodels/nonparametric/tests/test_kernel_regression.py @@ -175,7 +175,7 @@ def test_mixed_mfx_ll_cvls(self, file_name='RegData.csv'): npt.assert_allclose(sm_mfx[0, :], [b1, b2, b3], rtol=2e-1) @pytest.mark.slow - @pytest.mark.xfail(reason="Test doesn't make much sense - always passes " + @pytest.mark.xfail(reason="Test does not make much sense - always passes " "with very small bw.") def test_mfx_nonlinear_ll_cvls(self, file_name='RegData.csv'): nobs = 200 diff --git a/statsmodels/nonparametric/tests/test_kernels.py b/statsmodels/nonparametric/tests/test_kernels.py index 172d69a2eb8..774ddf148c6 100644 --- a/statsmodels/nonparametric/tests/test_kernels.py +++ b/statsmodels/nonparametric/tests/test_kernels.py @@ -27,7 +27,7 @@ xg = np.linspace(x.min(), x.max(), 40) # grid points default in Stata -# FIXME: don't leave this commented-out; use or move/remove +# FIXME: do not leave this commented-out; use or move/remove #kern_name = 'gau' #kern = kernels.Gaussian() #kern_name = 'epan2' @@ -37,7 +37,7 @@ #kern_name = 'tri' #kern = kernels.Triangular() #kern_name = 'cos' -#kern = kernels.Cosine() #doesn't match up, nan in Stata results ? 
+#kern = kernels.Cosine() #does not match up, nan in Stata results ? #kern_name = 'bi' #kern = kernels.Biweight() @@ -89,7 +89,7 @@ def test_smoothconf(self): # raises: RuntimeWarning: invalid value encountered in divide print(fitted / res_fitted - 1) print(se / res_se - 1) - # Stata only displays ci, doesn't save it + # Stata only displays ci, does not save it res_upp = res_fitted + crit * res_se res_low = res_fitted - crit * res_se self.res_fittedg = np.column_stack((res_low, res_fitted, res_upp)) diff --git a/statsmodels/nonparametric/tests/test_lowess.py b/statsmodels/nonparametric/tests/test_lowess.py index ef764983042..f0372c21458 100644 --- a/statsmodels/nonparametric/tests/test_lowess.py +++ b/statsmodels/nonparametric/tests/test_lowess.py @@ -26,7 +26,7 @@ class TestLowess(object): def test_import(self): - #this doesn't work + #this does not work #from statsmodels.api.nonparametric import lowess as lowess1 import statsmodels.api as sm lowess1 = sm.nonparametric.lowess diff --git a/statsmodels/regression/_prediction.py b/statsmodels/regression/_prediction.py index bf3b895fdf6..d3546b5c84e 100644 --- a/statsmodels/regression/_prediction.py +++ b/statsmodels/regression/_prediction.py @@ -83,8 +83,8 @@ def summary_frame(self, what='all', alpha=0.05): to_include['obs_ci_upper'] = ci_obs[:, 1] self.table = to_include - #OrderedDict doesn't work to preserve sequence - # pandas dict doesn't handle 2d_array + #OrderedDict does not work to preserve sequence + # pandas dict does not handle 2d_array #data = np.column_stack(list(to_include.values())) #names = .... res = pd.DataFrame(to_include, index=self.row_labels, diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py index 4f56aeb9522..04b8d354ebc 100644 --- a/statsmodels/regression/linear_model.py +++ b/statsmodels/regression/linear_model.py @@ -354,7 +354,7 @@ def predict(self, params, exog=None): ----- If the model has not yet been fit, params is not optional. 
""" - # JP: this doesn't look correct for GLMAR + # JP: this does not look correct for GLMAR # SS: it needs its own predict method if exog is None: @@ -1237,7 +1237,7 @@ def iterative_fit(self, maxiter=3, rtol=1e-4, **kwds): del self.pinv_wexog self.initialize() - # if converged then this is a duplicate fit, because we didn't + # if converged then this is a duplicate fit, because we did not # update rho results = self.fit(history=history, **kwds) results.iter = i + 1 @@ -1845,7 +1845,7 @@ def resid_pearson(self): raise ValueError('Method requires residuals.') eps = np.finfo(self.wresid.dtype).eps if np.sqrt(self.scale) < 10 * eps * self.model.endog.mean(): - # don't divide if scale is zero close to numerical precision + # do not divide if scale is zero close to numerical precision from warnings import warn warn("All residuals are 0, cannot compute normed residuals.", RuntimeWarning) @@ -2274,7 +2274,7 @@ def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): df_correction = kwds.get('df_correction', None) # TODO: check also use_correction, do I need all combinations? if df_correction is not False: # i.e. in [None, True]: - # user didn't explicitely set it to False + # user did not explicitely set it to False adjust_df = True res.cov_kwds['adjust_df'] = adjust_df @@ -2464,7 +2464,7 @@ def summary(self, yname=None, xname=None, title=None, alpha=.05): # TODO: requiring list/iterable is a bit annoying # need more control over formatting - # TODO: default don't work if it's not identically spelled + # TODO: default do not work if it's not identically spelled top_left = [('Dep. 
Variable:', None), ('Model:', None), diff --git a/statsmodels/regression/mixed_linear_model.py b/statsmodels/regression/mixed_linear_model.py index c79ccc3dbe0..93d4bdc98d5 100644 --- a/statsmodels/regression/mixed_linear_model.py +++ b/statsmodels/regression/mixed_linear_model.py @@ -754,7 +754,7 @@ def __init__(self, endog, exog, groups, exog_re=None, self.k_re2 = 0 if not self.data._param_names: - # HACK: could've been set in from_formula already + # HACK: could have been set in from_formula already # needs refactor (param_names, exog_re_names, exog_re_names_full) = self._make_param_names(exog_re) @@ -1105,7 +1105,7 @@ def fit_regularized(self, start_params=None, method='l1', alpha=0, regularization, the weights are used directly. ceps : positive real scalar Fixed effects parameters smaller than this value - in magnitude are treaded as being zero. + in magnitude are treated as being zero. ptol : positive real scalar Convergence occurs when the sup norm difference between successive values of `fe_params` is less than @@ -1264,7 +1264,7 @@ def get_fe_params(self, cov_re, vcomp): else: cov_re_inv = np.linalg.inv(cov_re) - # Cache these quantities that don't change. + # Cache these quantities that do not change. if not hasattr(self, "_endex_li"): self._endex_li = [] for group_ix, _ in enumerate(self.group_labels): @@ -2025,7 +2025,7 @@ def fit(self, start_params=None, reml=True, niter_sa=0, free : MixedLMParams object If not `None`, this is a mask that allows parameters to be held fixed at specified values. A 1 indicates that the - correspondinig parameter is estimated, a 0 indicates that + corresponding parameter is estimated, a 0 indicates that it is fixed at its starting value. Setting the `cov_re` component to the identity matrix fits a model with independent random effects. Note that some optimization @@ -2705,7 +2705,7 @@ def profile_re(self, re_ix, vtype, num_low=5, dist_low=1., num_high=5, begin calculating points on the profile likelihood. 
num_high : integer The number of points at which to calculate the likelihood - abov the MLE of the parameter of interest. + above the MLE of the parameter of interest. dist_high : float The distance above the MLE of the parameter of interest to begin calculating points on the profile likelihood. diff --git a/statsmodels/regression/quantile_regression.py b/statsmodels/regression/quantile_regression.py index bf7be7c4ace..4afe35c9ba5 100644 --- a/statsmodels/regression/quantile_regression.py +++ b/statsmodels/regression/quantile_regression.py @@ -147,7 +147,7 @@ def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather', beta = np.ones(exog_rank) # TODO: better start, initial beta is used only for convergence check - # Note the following doesn't work yet, + # Note the following does not work yet, # the iteration loop always starts with OLS as initial beta # if start_params is not None: # if len(start_params) != rank: @@ -179,7 +179,7 @@ def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather', history['mse'].append(np.mean(resid*resid)) if (n_iter >= 300) and (n_iter % 100 == 0): - # check for convergence circle, shouldn't happen + # check for convergence circle, should not happen for ii in range(2, 10): if np.all(beta == history['params'][-ii]): cycle = True diff --git a/statsmodels/regression/recursive_ls.py b/statsmodels/regression/recursive_ls.py index be77104ddf2..11ec252faa0 100644 --- a/statsmodels/regression/recursive_ls.py +++ b/statsmodels/regression/recursive_ls.py @@ -259,7 +259,7 @@ def __init__(self, model, params, filter_results, cov_type='opg', super(RecursiveLSResults, self).__init__( model, params, filter_results, cov_type, **kwargs) - # Since we are overriding params with things that aren't MLE params, + # Since we are overriding params with things that are not MLE params, # need to adjust df's q = max(self.loglikelihood_burn, self.k_diffuse_states) self.df_model = q - self.model.k_constraints @@ -500,7 +500,7 @@ def 
mse_total(self): def get_prediction(self, start=None, end=None, dynamic=False, index=None, **kwargs): - # Note: need to override this, because we currently don't support + # Note: need to override this, because we currently do not support # dynamic prediction or forecasts when there are constraints. if start is None: start = self.model._index[0] @@ -519,7 +519,7 @@ def get_prediction(self, start=None, end=None, dynamic=False, ' constraints.') # Perform the prediction - # This is a (k_endog x npredictions) array; don't want to squeeze in + # This is a (k_endog x npredictions) array; do not want to squeeze in # case of npredictions = 1 prediction_results = self.filter_results.predict( start, end + out_of_sample + 1, dynamic, **kwargs) diff --git a/statsmodels/regression/tests/test_glsar_stata.py b/statsmodels/regression/tests/test_glsar_stata.py index 08308477d05..eea15fa52a1 100644 --- a/statsmodels/regression/tests/test_glsar_stata.py +++ b/statsmodels/regression/tests/test_glsar_stata.py @@ -86,7 +86,7 @@ def test_glsar_iter0(self): res1 = mod1.fit() res0 = mod1.iterative_fit(0) res0b = mod1.iterative_fit(1) - # check iterative_fit(0) or iterative_fit(1) doesn't update rho + # check iterative_fit(0) or iterative_fit(1) does not update rho assert_allclose(res0.params, res1.params, rtol=1e-11) assert_allclose(res0b.params, res1.params, rtol=1e-11) assert_allclose(res0.model.rho, rho, rtol=1e-11) diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index 9b4f6039a32..c559290d147 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -151,8 +151,8 @@ def test_compare_numdiff(self, use_sqrt, reml, profile_fe): ngr = nd.approx_fprime(params_vec, loglike) assert_allclose(gr, ngr, rtol=1e-3) - # Check Hessian matrices at the MLE (we don't have - # the profile Hessian matrix and we don't care + # Check Hessian matrices at the MLE (we do not have + # the profile Hessian matrix and 
we do not care # about the Hessian for the square root # transformed parameter). if (profile_fe is False) and (use_sqrt is False): @@ -559,7 +559,7 @@ def test_pastes_vcomp(self): # logLik(r) assert_allclose(result.llf, -123.49, rtol=1e-1) - # don't provide aic/bic with REML + # do not provide aic/bic with REML assert_equal(result.aic, np.nan) assert_equal(result.bic, np.nan) diff --git a/statsmodels/regression/tests/test_predict.py b/statsmodels/regression/tests/test_predict.py index 80631ad581c..2730b465c11 100644 --- a/statsmodels/regression/tests/test_predict.py +++ b/statsmodels/regression/tests/test_predict.py @@ -15,7 +15,7 @@ def test_predict_se(): - # this test doesn't use reference values + # this test does not use reference values # checks conistency across options, and compares to direct calculation # generate dataset diff --git a/statsmodels/regression/tests/test_recursive_ls.py b/statsmodels/regression/tests/test_recursive_ls.py index d76973bbc86..99fa3dee098 100644 --- a/statsmodels/regression/tests/test_recursive_ls.py +++ b/statsmodels/regression/tests/test_recursive_ls.py @@ -213,7 +213,7 @@ def test_glm(constraints=None): # used in OLS. Compute new ic based on llf_alternative to compare. 
actual_aic = aic(llf_alternative, res.nobs_effective, res.df_model) assert_allclose(actual_aic, res_glm.aic) - # See gh#1733 for details on why the BIC doesn't match while AIC does + # See gh#1733 for details on why the BIC does not match while AIC does # actual_bic = bic(llf_alternative, res.nobs_effective, res.df_model) # assert_allclose(actual_bic, res_glm.bic) diff --git a/statsmodels/regression/tests/test_regression.py b/statsmodels/regression/tests/test_regression.py index 565ab0f528e..f0168142e9d 100644 --- a/statsmodels/regression/tests/test_regression.py +++ b/statsmodels/regression/tests/test_regression.py @@ -113,7 +113,7 @@ def test_mse_total(self): decimal_fvalue = DECIMAL_4 def test_fvalue(self): - # didn't change this, not sure it should complain -inf not equal -inf + # did not change this, not sure it should complain -inf not equal -inf # if not (np.isinf(self.res1.fvalue) and np.isinf(self.res2.fvalue)): assert_almost_equal(self.res1.fvalue, self.res2.fvalue, self.decimal_fvalue) @@ -527,7 +527,7 @@ def test_wrong_size_sigma_2d(self): assert_raises(ValueError, GLS, self.endog, self.exog, sigma=np.ones((n-1, n-1))) -# FIXME: don't leave commented-out, use or move/remove +# FIXME: do not leave commented-out, use or move/remove # def check_confidenceintervals(self, conf1, conf2): # assert_almost_equal(conf1, conf2, DECIMAL_4) @@ -821,7 +821,7 @@ def check_confidenceintervals(self, conf1, conf2): # FIXME: never called assert_almost_equal(conf1, conf2(), DECIMAL_4) -# FIXME: don't leave this commented-out sitting here +# FIXME: do not leave this commented-out sitting here # TODO: test AR # why the two-stage in AR? 
# class TestAR(object): diff --git a/statsmodels/regression/tests/test_robustcov.py b/statsmodels/regression/tests/test_robustcov.py index b93c0493375..5937886306d 100644 --- a/statsmodels/regression/tests/test_robustcov.py +++ b/statsmodels/regression/tests/test_robustcov.py @@ -364,7 +364,7 @@ def setup_class(cls): dtapa_endog = dtapa.endog[:200] dtapa_exog = dtapa.exog[:200] exog = add_constant(dtapa_exog[['value', 'capital']], prepend=False) - #asserts don't work for pandas + #asserts do not work for pandas cls.res1 = OLS(dtapa_endog, exog).fit() firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'), @@ -718,7 +718,7 @@ def setup_class(cls): dtapa_endog = dtapa.endog[:200] dtapa_exog = dtapa.exog[:200] exog = add_constant(dtapa_exog[['value', 'capital']], prepend=False) - #asserts don't work for pandas + #asserts do not work for pandas cls.res1 = WLS(dtapa_endog, exog, weights=1/dtapa_exog['value']).fit() firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'), @@ -817,7 +817,7 @@ def setup_class(cls): dtapa_endog = dtapa.endog[:200] dtapa_exog = dtapa.exog[:200] exog = add_constant(dtapa_exog[['value', 'capital']], prepend=False) - #asserts don't work for pandas + #asserts do not work for pandas cls.res_wls = WLS(dtapa_endog, exog, weights=1/dtapa_exog['value']).fit() w_sqrt = 1 / np.sqrt(np.asarray(dtapa_exog['value'])) cls.res_ols = OLS(dtapa_endog * w_sqrt, @@ -853,7 +853,7 @@ def test_all(self): assert_allclose(res1.cov_params(), res2.cov_params(), rtol=1e-13) assert_allclose(res1.bse, res2.bse, rtol=1e-13) assert_allclose(res1.pvalues, res2.pvalues, rtol=1e-13) - #Note: Fvalue doesn't match up, difference in calculation ? + #Note: Fvalue does not match up, difference in calculation ? 
# The only difference should be in the constant detection #assert_allclose(res1.fvalue, res2.fvalue, rtol=1e-13) #assert_allclose(res1.f_pvalue, res2.f_pvalue, rtol=1e-13) diff --git a/statsmodels/robust/robust_linear_model.py b/statsmodels/robust/robust_linear_model.py index 74fb903dcfa..080b7b8f094 100644 --- a/statsmodels/robust/robust_linear_model.py +++ b/statsmodels/robust/robust_linear_model.py @@ -391,7 +391,7 @@ class RLMResults(base.LikelihoodModelResults): from the robust covariance matrix specified in the argument to fit. weights : array The reported weights are determined by passing the scaled residuals - from the last weighted least squares fit in the IRLS algortihm. + from the last weighted least squares fit in the IRLS algorithm. See Also -------- diff --git a/statsmodels/robust/scale.py b/statsmodels/robust/scale.py index e9e947970a4..124631f169b 100644 --- a/statsmodels/robust/scale.py +++ b/statsmodels/robust/scale.py @@ -28,7 +28,7 @@ def mad(a, c=Gaussian.ppf(3/4.), axis=0, center=np.median): The normalization constant. Defined as scipy.stats.norm.ppf(3/4.), which is approximately .6745. axis : int, optional - The defaul is 0. Can also be None. + The default is 0. Can also be None. center : callable or float If a callable is provided, such as the default `np.median` then it is expected to be called center(a). 
The axis argument will be applied diff --git a/statsmodels/sandbox/archive/linalg_decomp_1.py b/statsmodels/sandbox/archive/linalg_decomp_1.py index 8e0c797f517..8ebf3015e22 100644 --- a/statsmodels/sandbox/archive/linalg_decomp_1.py +++ b/statsmodels/sandbox/archive/linalg_decomp_1.py @@ -178,7 +178,7 @@ def __init__(self, data=None, sym=None): def yt_minv_y(self, y): '''xSigmainvx - doesn't use stored cholesky yet + does not use stored cholesky yet ''' return np.dot(x,linalg.cho_solve(linalg.cho_factor(self.m),x)) #same as diff --git a/statsmodels/sandbox/bspline.py b/statsmodels/sandbox/bspline.py index 5dde308ff4d..5ddbc34b67c 100644 --- a/statsmodels/sandbox/bspline.py +++ b/statsmodels/sandbox/bspline.py @@ -469,7 +469,7 @@ def fit(self, y, x=None, weights=None, pen=0.): else: bt = self.basis(x) - if pen == 0.: # can't use cholesky for singular matrices + if pen == 0.: # cannot use cholesky for singular matrices banded = False if x.shape != y.shape: diff --git a/statsmodels/sandbox/datarich/__init__.py b/statsmodels/sandbox/datarich/__init__.py index 1e26f0c0994..6d9011f95ed 100644 --- a/statsmodels/sandbox/datarich/__init__.py +++ b/statsmodels/sandbox/datarich/__init__.py @@ -40,7 +40,7 @@ The same tools apply and can be used in these two cases. e.g. Tychonov regularization of weighting matrix in GMM, similar to Ridge regression, the weighting matrix can be shrunk towards the identity matrix. -Simplest case will be part of GMM. I don't know how much will be standalone +Simplest case will be part of GMM. I do not know how much will be standalone functions. @@ -63,7 +63,7 @@ Selection criteria based on eigenvalue cutoffs. Paper on PCA and structural breaks. Could add additional results during -find_nfact to test for parameter stability. I haven't read the paper yet. +find_nfact to test for parameter stability. I have not read the paper yet. Idea: for forecasting, use up to h-step ahead endogenous variables to directly get the forecasts. 
diff --git a/statsmodels/sandbox/dataset_notes.rst b/statsmodels/sandbox/dataset_notes.rst index b57ecac948a..846213b4112 100644 --- a/statsmodels/sandbox/dataset_notes.rst +++ b/statsmodels/sandbox/dataset_notes.rst @@ -58,7 +58,7 @@ from data import * 8) Edit the datasets.__init__.py to import the new directory -9) Make sure everything is correct, and you've saved everything, +9) Make sure everything is correct, and you have saved everything, and put the directory under version control. bzr add spector diff --git a/statsmodels/sandbox/distributions/estimators.py b/statsmodels/sandbox/distributions/estimators.py index aa77ced3364..140248d3629 100644 --- a/statsmodels/sandbox/distributions/estimators.py +++ b/statsmodels/sandbox/distributions/estimators.py @@ -4,7 +4,7 @@ Warning: I'm still finding cut-and-paste and refactoring errors, e.g. hardcoded variables from outer scope in functions - some results don't seem to make sense for Pareto case, + some results do not seem to make sense for Pareto case, looks better now after correcting some name errors initially loosely based on a paper and blog for quantile matching @@ -32,10 +32,10 @@ example: t-distribution * works with quantiles if they contain tail quantiles -* results with momentcondquant don't look as good as mle estimate +* results with momentcondquant do not look as good as mle estimate TODOs -* rearange and make sure I don't use module globals (as I did initially) DONE +* rearrange and make sure I do not use module globals (as I did initially) DONE make two version exactly identified method of moments with fsolve and GMM (?)
version with fmin and maybe the special cases of JD Cook @@ -136,7 +136,7 @@ def gammamomentcond2(distfn, params, mom2, quantile=None): -######### fsolve doesn't move in small samples, fmin not very accurate +######### fsolve does not move in small samples, fmin not very accurate def momentcondunbound(distfn, params, mom2, quantile=None): '''moment conditions for estimating distribution parameters using method of moments, uses mean, variance and one quantile for distributions @@ -577,7 +577,7 @@ def fit_mps(dist, data, x0=None): ''' example results: standard error for df estimate looks large - note: iI don't impose that df is an integer, (b/c not necessary) + note: I do not impose that df is an integer, (b/c not necessary) need Monte Carlo to check variance of estimators @@ -633,7 +633,7 @@ def fit_mps(dist, data, x0=None): #example Maximum Product of Spacings Estimation # current results: - # doesn't look very good yet sensitivity to starting values + # does not look very good yet, sensitivity to starting values # Pareto and Generalized Pareto look like a tough estimation problemprint('\n\nExample: Lognormal Distribution' print('\n\nExample: Lomax, Pareto, Generalized Pareto Distributions') diff --git a/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py b/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py index 45973b8cea1..55801491e04 100644 --- a/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py +++ b/statsmodels/sandbox/distributions/examples/ex_mvelliptical.py @@ -97,7 +97,7 @@ print(res.model.predict(np.array([1,1,1]))) print(mv2c.mean) -#the following wrong input doesn't raise an exception but produces wrong numbers +#the following wrong input does not raise an exception but produces wrong numbers #mv2c = mvn3.conditional(np.array([0]), [[1, 1],[2,2]]) #************** multivariate t distribution *************** diff --git a/statsmodels/sandbox/distributions/extras.py b/statsmodels/sandbox/distributions/extras.py index
f9ce91e0d4d..9851d5823f7 100644 --- a/statsmodels/sandbox/distributions/extras.py +++ b/statsmodels/sandbox/distributions/extras.py @@ -13,7 +13,7 @@ TODO: * Where is Transf_gen for general monotonic transformation ? found and added it -* write some docstrings, some parts I don't remember +* write some docstrings, some parts I do not remember * add Box-Cox transformation, parameterized ? @@ -1025,7 +1025,7 @@ def mvstdnormcdf(lower, upper, corrcoef, **kwds): ''' n = len(lower) - #don't know if converting to array is necessary, + #do not know if converting to array is necessary, #but it makes ndim check possible lower = np.array(lower) upper = np.array(upper) diff --git a/statsmodels/sandbox/distributions/genpareto.py b/statsmodels/sandbox/distributions/genpareto.py index efeeaa0d9b7..fa0c9cdb445 100644 --- a/statsmodels/sandbox/distributions/genpareto.py +++ b/statsmodels/sandbox/distributions/genpareto.py @@ -98,7 +98,7 @@ def meanexcess(thresh, shape, scale): def meanexcess_plot(data, params=None, lidx=100, uidx=10, method='emp', plot=0): if method == 'est': - #doesn't make much sense yet, + #does not make much sense yet, #estimate the parameters and use theoretical meanexcess if params is None: raise NotImplementedError diff --git a/statsmodels/sandbox/distributions/gof_new.py b/statsmodels/sandbox/distributions/gof_new.py index 01ea33d6821..cc66de16059 100644 --- a/statsmodels/sandbox/distributions/gof_new.py +++ b/statsmodels/sandbox/distributions/gof_new.py @@ -214,7 +214,7 @@ def kstest(rvs, cdf, args=(), N=20, alternative = 'two_sided', mode='approx',**k >>> kstest(x,'norm', alternative = 'greater') (0.0072115233216311081, 0.98531158590396395) - Don't reject equal distribution against alternative hypothesis: greater + Do not reject equal distribution against alternative hypothesis: greater >>> kstest(x,'norm', mode='asymp') (0.12464329735846891, 0.08944488871182088) @@ -472,7 +472,7 @@ def a(self): @cache_readonly def asqu(self): - '''Stephens 1974, doesn't 
have p-value formula for A^2''' + '''Stephens 1974, does not have p-value formula for A^2''' nobs = self.nobs cdfvals = self.cdfvals diff --git a/statsmodels/sandbox/distributions/mv_measures.py b/statsmodels/sandbox/distributions/mv_measures.py index b6244f6cc9e..7074da810f9 100644 --- a/statsmodels/sandbox/distributions/mv_measures.py +++ b/statsmodels/sandbox/distributions/mv_measures.py @@ -102,7 +102,7 @@ def mutualinfo_binned(y, x, bins, normed=True): qbin_sqr = np.sqrt(5./nobs) quantiles = np.linspace(0, 1, 1./qbin_sqr) quantile_index = ((nobs-1)*quantiles).astype(int) - #move edges so that they don't coincide with an observation + #move edges so that they do not coincide with an observation shift = 1e-6 + np.ones(quantiles.shape) shift[0] -= 2*1e-6 binsy = ys[quantile_index] + shift diff --git a/statsmodels/sandbox/distributions/mv_normal.py b/statsmodels/sandbox/distributions/mv_normal.py index 7e99402aea7..af8b1fb384d 100644 --- a/statsmodels/sandbox/distributions/mv_normal.py +++ b/statsmodels/sandbox/distributions/mv_normal.py @@ -9,7 +9,7 @@ TODO: * renaming, - - after adding t distribution, cov doesn't make sense for Sigma DONE + - after adding t distribution, cov does not make sense for Sigma DONE - should mean also be renamed to mu, if there will be distributions with mean != mu * not sure about corner cases @@ -167,7 +167,7 @@ def expect_mc(dist, func=lambda x: 1, size=50000): Notes ----- - this doesn't batch + this does not batch Returns ------- @@ -231,7 +231,7 @@ def expect_mc_bounds(dist, func=lambda x: 1, size=50000, lower=None, upper=None, Notes ----- - this doesn't batch + this does not batch Returns ------- @@ -470,7 +470,7 @@ def logpdf(self, x): this should be made to work with 2d x, with multivariate normal vector in each row and iid across rows - doesn't work now because of dot in whiten + does not work now because of dot in whiten ''' @@ -519,7 +519,7 @@ def whiten(self, x): Notes ----- - This only does rescaling, it doesn't subtract 
the mean, use standardize + This only does rescaling, it does not subtract the mean, use standardize for this instead See Also @@ -739,7 +739,7 @@ def whiten(self, x): Notes ----- - This only does rescaling, it doesn't subtract the mean, use standardize + This only does rescaling, it does not subtract the mean, use standardize for this instead See Also @@ -811,7 +811,7 @@ def logpdf(self, x): this should be made to work with 2d x, with multivariate normal vector in each row and iid across rows - doesn't work now because of dot in whiten + does not work now because of dot in whiten ''' x = np.asarray(x) @@ -875,7 +875,7 @@ def logpdf(self, x): this should be made to work with 2d x, with multivariate normal vector in each row and iid across rows - doesn't work now because of dot in whiten + does not work now because of dot in whiten ''' x = np.asarray(x) @@ -1111,7 +1111,7 @@ def cdf(self, x, **kwds): #std_sigma = np.sqrt(np.diag(self.sigma)) upper = (x - self.mean)/self.std_sigma return mvstdtprob(lower, upper, self.corr, self.df, **kwds) - #mvstdtcdf doesn't exist yet + #mvstdtcdf does not exist yet #return mvstdtcdf(lower, x, self.corr, df, **kwds) @property diff --git a/statsmodels/sandbox/distributions/sppatch.py b/statsmodels/sandbox/distributions/sppatch.py index 912b654f6f7..feea91bd023 100644 --- a/statsmodels/sandbox/distributions/sppatch.py +++ b/statsmodels/sandbox/distributions/sppatch.py @@ -17,7 +17,7 @@ ########## patching scipy -#vonmises doesn't define finite bounds, because it is intended for circular +#vonmises does not define finite bounds, because it is intended for circular #support which does not define a proper pdf on the real line stats.distributions.vonmises.a = -np.pi diff --git a/statsmodels/sandbox/distributions/tests/_est_fit.py b/statsmodels/sandbox/distributions/tests/_est_fit.py index 7a9f8977d3f..0a5facd0229 100644 --- a/statsmodels/sandbox/distributions/tests/_est_fit.py +++ b/statsmodels/sandbox/distributions/tests/_est_fit.py @@ 
-1,5 +1,5 @@ # NOTE: contains only one test, _est_cont_fit, that is renamed so that -# the test runner doesn't run it +# the test runner does not run it # I put this here for the record and for the case when someone wants to # verify the quality of fit # with current parameters: relatively small sample size, default starting values @@ -14,7 +14,7 @@ from .distparams import distcont # this is not a proper statistical test for convergence, but only -# verifies that the estimate and true values don't differ by too much +# verifies that the estimate and true values do not differ by too much n_repl1 = 1000 # sample size for first run n_repl2 = 5000 # sample size for second run, if first run fails thresh_percent = 0.25 # percent of true parameters for fail cut-off @@ -25,7 +25,7 @@ def _est_cont_fit(): # this tests the closeness of the estimated parameters to the true # parameters with fit method of continuous distributions - # Note: is slow, some distributions don't converge with sample size <= 10000 + # Note: is slow, some distributions do not converge with sample size <= 10000 for distname, arg in distcont: yield check_cont_fit, distname,arg diff --git a/statsmodels/sandbox/distributions/tests/test_norm_expan.py b/statsmodels/sandbox/distributions/tests/test_norm_expan.py index 4efa49c9050..ad4e7010c3c 100644 --- a/statsmodels/sandbox/distributions/tests/test_norm_expan.py +++ b/statsmodels/sandbox/distributions/tests/test_norm_expan.py @@ -68,7 +68,7 @@ def setup_class(kls): class TestExpandNormSample(object): - # don't subclass CheckExpandNorm, + # do not subclass CheckExpandNorm, # precision not high enough because of mvsk from data @classmethod diff --git a/statsmodels/sandbox/distributions/tests/test_transf.py b/statsmodels/sandbox/distributions/tests/test_transf.py index d0ada29add5..60348f3bd98 100644 --- a/statsmodels/sandbox/distributions/tests/test_transf.py +++ b/statsmodels/sandbox/distributions/tests/test_transf.py @@ -90,7 +90,7 @@ def setup_class(cls): 
(squarenormalg, stats.chi2(1)), (absnormalg, stats.halfnorm), (absnormalg, stats.foldnorm(1e-5)), #try frozen - #(negsquarenormalg, 1-stats.chi2), # won't work as distribution + #(negsquarenormalg, 1-stats.chi2), # will not work as distribution (squaretg(10), stats.f(1, 10)) ] #try both frozen diff --git a/statsmodels/sandbox/distributions/try_pot.py b/statsmodels/sandbox/distributions/try_pot.py index d5c82d90375..6a088423b26 100644 --- a/statsmodels/sandbox/distributions/try_pot.py +++ b/statsmodels/sandbox/distributions/try_pot.py @@ -20,10 +20,10 @@ def mean_residual_life(x, frac=None, alpha=0.05): TODO: check formula for std of mean - doesn't include case for all observations + does not include case for all observations last observations std is zero vectorize loop using cumsum - frac doesn't work yet + frac does not work yet ''' axis = 0 # searchsorted is 1d only @@ -42,7 +42,8 @@ def mean_residual_life(x, frac=None, alpha=0.05): for i in range(len(xthreshold)-1): k_ind = xlargerindex[i] rmean = x[k_ind:].mean() - rstd = x[k_ind:].std() # this doesn't work for last observations, nans + # this does not work for last observations, nans + rstd = x[k_ind:].std() rmstd = rstd/np.sqrt(nobs-k_ind) # std error of mean, check formula result.append((k_ind, xthreshold[i], rmean, rmstd)) diff --git a/statsmodels/sandbox/examples/bayesprior.py b/statsmodels/sandbox/examples/bayesprior.py index dfd703d4ecf..c0e48959d58 100644 --- a/statsmodels/sandbox/examples/bayesprior.py +++ b/statsmodels/sandbox/examples/bayesprior.py @@ -114,7 +114,7 @@ def integ(x): theta1 = np.random.uniform(0,1) theta2 = np.random.normal(mu_, lambda_**2) # mu = theta2/(1-theta1) -#don't do this to maintain independence theta2 is the _location_ +#do not do this to maintain independence theta2 is the _location_ # y1 = np.random.normal(mu_, lambda_**2) y1 = theta2 # pmu_pairsp1[draw] = mu, theta1 @@ -180,7 +180,7 @@ def integ(x): #plt.show() -#TODO: this doesn't look the same as the working paper? 
+#TODO: this does not look the same as the working paper? #NOTE: but it matches the language? I think mine is right! # Contour plots. diff --git a/statsmodels/sandbox/examples/ex_cusum.py b/statsmodels/sandbox/examples/ex_cusum.py index 4923eb2c6bd..feddac1643c 100644 --- a/statsmodels/sandbox/examples/ex_cusum.py +++ b/statsmodels/sandbox/examples/ex_cusum.py @@ -102,5 +102,5 @@ ##check whether this works directly: Ploberger/Kramer framing of standard cusum ##no, it's different, there is another denominator #print breaks_cusumolsresid(rresid[skip:]) -#this function is still completely wrong, cut and paste doesn't apply +#this function is still completely wrong, cut and paste does not apply #print breaks_cusum(rresid[skip:]) diff --git a/statsmodels/sandbox/examples/ex_gam_results.py b/statsmodels/sandbox/examples/ex_gam_results.py index 1e5b0d47bf0..8c646e6b66b 100644 --- a/statsmodels/sandbox/examples/ex_gam_results.py +++ b/statsmodels/sandbox/examples/ex_gam_results.py @@ -7,7 +7,7 @@ The example is loaded from a test module. The test still fails but the results look relatively good. 
-I don't know yet why there is the small difference and why GAM doesn't +I do not know yet why there is the small difference and why GAM does not converge in this case """ @@ -46,7 +46,7 @@ plt.plot(comp2, 'b-') plt.plot(comp1_true, 'k--', lw=2) plt.plot(comp2_true, 'k--', lw=2) -#the next doesn't make sense - non-linear +#the next does not make sense - non-linear #c1 = tt.family.link(tt.family.link.inverse(comp1_true) + noise) #c2 = tt.family.link(tt.family.link.inverse(comp2_true) + noise) #not nice in example/plot: noise variance is constant not proportional diff --git a/statsmodels/sandbox/examples/ex_mixed_lls_0.py b/statsmodels/sandbox/examples/ex_mixed_lls_0.py index 70827172a78..92e1ffd1462 100644 --- a/statsmodels/sandbox/examples/ex_mixed_lls_0.py +++ b/statsmodels/sandbox/examples/ex_mixed_lls_0.py @@ -114,7 +114,7 @@ >>> m.cov_random() array([[ 0.0348722 , -0.00909159], [-0.00909159, 0.26846254]]) - >>> #note cov_random doesn't subtract mean! + >>> #note cov_random does not subtract mean! ''' print('\nchecking the random effects distribution and prediction') gamma_re_true = np.array(gamma_re_true) @@ -137,7 +137,7 @@ print('rmse_perc (std)', rmse_perc) #from numpy.testing import assert_almost_equal #assert is for n_units=100 in original example - #I changed random number generation, so this won't work anymore + #I changed random number generation, so this will not work anymore #assert_almost_equal(rmse_perc, [ 34.14783884, 11.6031684 ], decimal=8) #now returns res diff --git a/statsmodels/sandbox/examples/ex_mixed_lls_re.py b/statsmodels/sandbox/examples/ex_mixed_lls_re.py index 8ea44b183fc..980e6d6828f 100644 --- a/statsmodels/sandbox/examples/ex_mixed_lls_re.py +++ b/statsmodels/sandbox/examples/ex_mixed_lls_re.py @@ -119,7 +119,7 @@ >>> m.cov_random() array([[ 0.0348722 , -0.00909159], [-0.00909159, 0.26846254]]) - >>> #note cov_random doesn't subtract mean! + >>> #note cov_random does not subtract mean! 
''' print('\nchecking the random effects distribution and prediction') gamma_re_true = np.array(gamma_re_true) @@ -142,7 +142,7 @@ print('rmse_perc (std)', rmse_perc) #from numpy.testing import assert_almost_equal #assert is for n_units=100 in original example - #I changed random number generation, so this won't work anymore + #I changed random number generation, so this will not work anymore #assert_almost_equal(rmse_perc, [ 34.14783884, 11.6031684 ], decimal=8) #now returns res diff --git a/statsmodels/sandbox/examples/ex_mixed_lls_timecorr.py b/statsmodels/sandbox/examples/ex_mixed_lls_timecorr.py index 2b88c120cb9..ed64d3a05c1 100644 --- a/statsmodels/sandbox/examples/ex_mixed_lls_timecorr.py +++ b/statsmodels/sandbox/examples/ex_mixed_lls_timecorr.py @@ -23,15 +23,15 @@ I needed to remove the first observation from the time dummies to avoid a singular matrix. So, interpretation of time effects should be relative to -first observation. (I didn't check the math.) +first observation. (I did not check the math.) TODO: -Note, I don't already have constant in X. Constant for first +Note, I do not already have constant in X. Constant for first time observation is missing. Do I need all dummies in exog_fe, Z, but not in exog_re, Z? Tried this and it works. In the error decomposition we also have the noise variable, I guess this works like constant, so we get full rank (square) with only T-1 time dummies. -But we don't get correlation with the noise, or do we? conditional? +But we do not get correlation with the noise, or do we? conditional? -> sample correlation of estimated random effects looks a bit high, upward bias? or still some problems with initial condition? correlation from estimated cov_random looks good. @@ -43,7 +43,7 @@ Intertemporal correlation in data generating process, DGP, to see if the results correctly estimate it. used AR(1) as example, but only starting at second period. (?) 
-Note: we don't impose AR structure in the estimation +Note: we do not impose AR structure in the estimation """ @@ -168,7 +168,7 @@ >>> m.cov_random() array([[ 0.0348722 , -0.00909159], [-0.00909159, 0.26846254]]) - >>> #note cov_random doesn't subtract mean! + >>> #note cov_random does not subtract mean! ''' print('\nchecking the random effects distribution and prediction') gamma_re_true = np.array(gamma_re_true) @@ -191,7 +191,7 @@ print('rmse_perc (std)', rmse_perc) #from numpy.testing import assert_almost_equal #assert is for n_units=100 in original example - #I changed random number generation, so this won't work anymore + #I changed random number generation, so this will not work anymore #assert_almost_equal(rmse_perc, [ 34.14783884, 11.6031684 ], decimal=8) #now returns res diff --git a/statsmodels/sandbox/examples/ex_random_panel.py b/statsmodels/sandbox/examples/ex_random_panel.py index a52eb833887..a555b531f26 100644 --- a/statsmodels/sandbox/examples/ex_random_panel.py +++ b/statsmodels/sandbox/examples/ex_random_panel.py @@ -57,7 +57,7 @@ #OLS standard errors are too small mod.res_pooled.params mod.res_pooled.bse - #heteroscedasticity robust doesn't help + #heteroscedasticity robust does not help mod.res_pooled.HC1_se #compare with cluster robust se @@ -131,7 +131,7 @@ import pandas as pa - #pandas.DataFrame doesn't do inplace append + #pandas.DataFrame does not do inplace append se = pa.DataFrame(res_ols.bse[None,:], index=['OLS']) se = se.append(pa.DataFrame(res5.bse[None,:], index=['PGLSit5'])) clbse = sw.se_cov(sw.cov_cluster(mod.res_pooled, dgp.groups.astype(int))) diff --git a/statsmodels/sandbox/examples/example_gam_0.py b/statsmodels/sandbox/examples/example_gam_0.py index 3afe59ece05..1d6e3c9cb75 100644 --- a/statsmodels/sandbox/examples/example_gam_0.py +++ b/statsmodels/sandbox/examples/example_gam_0.py @@ -50,7 +50,7 @@ z -= np.median(z) print('z.std()', z.std()) #z = standardize(z) + 0.2 -# with standardize I get better values, but I don't 
know what the true params are +# with standardize I get better values, but I do not know what the true params are print(z.mean(), z.min(), z.max()) #y += z #noise diff --git a/statsmodels/sandbox/examples/example_mle.py b/statsmodels/sandbox/examples/example_mle.py index f013b6fd14a..c6795d87526 100644 --- a/statsmodels/sandbox/examples/example_mle.py +++ b/statsmodels/sandbox/examples/example_mle.py @@ -49,7 +49,7 @@ #resfmin2 = optimize.fmin(f, mod.results.params*0.9, maxfun=5000, maxiter=5000, xtol=1e-10, ftol= 1e-10) resfmin2 = optimize.fmin(f, np.ones(7), maxfun=5000, maxiter=5000, xtol=1e-10, ftol= 1e-10) print(resfmin2) -# there isn't a unique solution? Is this due to the multicollinearity? Improved with use of analytically +# there is not a unique solution? Is this due to the multicollinearity? Improved with use of analytically # defined score function? #check X'X matrix diff --git a/statsmodels/sandbox/examples/example_nbin.py b/statsmodels/sandbox/examples/example_nbin.py index d2b1ea519c0..f49c407c471 100644 --- a/statsmodels/sandbox/examples/example_nbin.py +++ b/statsmodels/sandbox/examples/example_nbin.py @@ -40,7 +40,7 @@ def _ll_nbp(y, X, beta, alph, Q): References: - Greene, W. 2008. "Functional forms for the negtive binomial model + Greene, W. 2008. "Functional forms for the negative binomial model for count data". Economics Letters. Volume 99, Number 3, pp.585-590. Hilbe, J.M. 2011. "Negative binomial regression". Cambridge University Press. 
diff --git a/statsmodels/sandbox/examples/run_all.py b/statsmodels/sandbox/examples/run_all.py index 85ebff62969..f5bbb488279 100644 --- a/statsmodels/sandbox/examples/run_all.py +++ b/statsmodels/sandbox/examples/run_all.py @@ -1,4 +1,4 @@ -'''run all examples to make sure we don't get an exception +'''run all examples to make sure we do not get an exception Note: If an example contaings plt.show(), then all plot windows have to be closed @@ -33,4 +33,4 @@ raise #plt.show() #plt.close('all') -#close doesn't work because I never get here without closing plots manually +#close does not work because I never get here without closing plots manually diff --git a/statsmodels/sandbox/examples/thirdparty/findow_1.py b/statsmodels/sandbox/examples/thirdparty/findow_1.py index 5cc49bcf0df..b2ec6108623 100644 --- a/statsmodels/sandbox/examples/thirdparty/findow_1.py +++ b/statsmodels/sandbox/examples/thirdparty/findow_1.py @@ -9,7 +9,7 @@ colors of lines in graphs are not great uses DataFrame and WidePanel to hold data downloaded from yahoo using matplotlib. -I haven't figured out storage, so the download happens at each run +I have not figured out storage, so the download happens at each run of the script. 
Created on Sat Jan 30 16:30:18 2010 diff --git a/statsmodels/sandbox/examples/try_gmm_other.py b/statsmodels/sandbox/examples/try_gmm_other.py index 3cc3fdc7f8f..094845f55bc 100644 --- a/statsmodels/sandbox/examples/try_gmm_other.py +++ b/statsmodels/sandbox/examples/try_gmm_other.py @@ -118,7 +118,7 @@ def sample_ivfake(exog): x0p = [1., gparrvs.min()-5, 1] moddist = gmm.DistQuantilesGMM(gparrvs, None, None, distfn=stats.genpareto) - #produces non-sense because optimal weighting matrix calculations don't + #produces nonsense because optimal weighting matrix calculations do not #apply to this case #resgp = moddist.fit() #now with 'cov': LinAlgError: Singular matrix pit1, wit1 = moddist.fititer([1.5,0,1.5], maxiter=1) diff --git a/statsmodels/sandbox/gam.py b/statsmodels/sandbox/gam.py index 3d6576c67b9..b579fe98a10 100644 --- a/statsmodels/sandbox/gam.py +++ b/statsmodels/sandbox/gam.py @@ -17,7 +17,7 @@ - testfailure with Gamma, no other families tested - there is still an indeterminacy in the split up of the constant across components (smoothers) and alpha, sum, i.e. constant, looks good.
- - role of offset, that I haven't tried to figure out yet + - role of offset, that I have not tried to figure out yet Refactoring ----------- @@ -39,7 +39,7 @@ # TODO: check/catalogue required interface of a smoother # TODO: replace default smoother by corresponding function to initialize # other smoothers -# TODO: fix iteration, don't define class with iterator methods, use looping; +# TODO: fix iteration, do not define class with iterator methods, use looping; # add maximum iteration and other optional stop criteria # fixed some of the dimension problems in PolySmoother, # now graph for example looks good @@ -110,7 +110,7 @@ class Results(object): def __init__(self, Y, alpha, exog, smoothers, family, offset): self.nobs, self.k_vars = exog.shape #assumes exog is 2d #weird: If I put the previous line after the definition of self.mu, - # then the attributed don't get added + # then the attributes do not get added self.Y = Y self.alpha = alpha self.smoothers = smoothers @@ -141,7 +141,7 @@ def predict(self, exog): #TODO: transpose in smoothed and sum over axis=1 #BUG: there is some inconsistent orientation somewhere - #temporary hack, won't work for 1d + #temporary hack, will not work for 1d #print dir(self) #print 'self.nobs, self.k_vars', self.nobs, self.k_vars exog_smoothed = self.smoothed(exog) @@ -162,9 +162,9 @@ def smoothed(self, exog): ''' #bug: with exog in predict I get a shape error #print 'smoothed', exog.shape, self.smoothers[0].predict(exog).shape - #there was a mistake exog didn't have column index i + #there was a mistake exog did not have column index i return np.array([self.smoothers[i].predict(exog[:,i]) + self.offset[i] - #shouldn't be a mistake because exog[:,i] is attached to smoother, but + #should not be a mistake because exog[:,i] is attached to smoother, but #it is for different exog #return np.array([self.smoothers[i].predict() + self.offset[i] for i in range(exog.shape[1])]).T @@ -221,7 +221,7 @@ def next(self): '''internal calculation
for one fit iteration BUG: I think this does not improve, what is supposed to improve - offset doesn't seem to be used, neither an old alpha + offset does not seem to be used, neither an old alpha The smoothers keep coef/params from previous iteration ''' _results = self.results @@ -331,7 +331,7 @@ def fit(self, Y, rtol=1.0e-06, maxiter=30): class Model(GLM, AdditiveModel): #class Model(AdditiveModel): #TODO: what does GLM do? Is it actually used ? - #only used in __init__, dropping it doesn't change results + #only used in __init__, dropping it does not change results #but where gets family attached now? - weird, it's Gaussian in this case now #also where is the link defined? #AdditiveModel overwrites family and sets it to Gaussian - corrected @@ -375,7 +375,7 @@ def next(self): m = AdditiveModel(self.exog, smoothers=self.smoothers, weights=self.weights, family=self.family) - #TODO: I don't know what the next two lines do, Z, Y ? which is endog? + #TODO: I do not know what the next two lines do, Z, Y ? which is endog? #Y is original endog, Z is endog for the next step in the iterative solver _results = m.fit(Z) diff --git a/statsmodels/sandbox/infotheo.py b/statsmodels/sandbox/infotheo.py index 8cb2a9e0488..e4c1d5541bd 100644 --- a/statsmodels/sandbox/infotheo.py +++ b/statsmodels/sandbox/infotheo.py @@ -179,7 +179,7 @@ def shannonentropy(px, logbase=2): ----- shannonentropy(0) is defined as 0 """ -#TODO: haven't defined the px,py case? +#TODO: have not defined the px,py case? 
px = np.asarray(px) if not np.all(px <= 1) or not np.all(px >= 0): raise ValueError("px does not define proper distribution") diff --git a/statsmodels/sandbox/mcevaluate/mcresuts_arma1.txt b/statsmodels/sandbox/mcevaluate/mcresuts_arma1.txt index 6bf97da1e23..6f4a281cbed 100644 --- a/statsmodels/sandbox/mcevaluate/mcresuts_arma1.txt +++ b/statsmodels/sandbox/mcevaluate/mcresuts_arma1.txt @@ -6,7 +6,7 @@ Comments: --------- scikits.statsmodels.tsa.arma_mle.Arma.fit((2,0,2)) niter=100 -didn't use seed +did not use seed some strange inf in median bias percent and positive error fraction equal to 1 diff --git a/statsmodels/sandbox/multilinear.py b/statsmodels/sandbox/multilinear.py index 35a2288238a..2f7a448daa1 100644 --- a/statsmodels/sandbox/multilinear.py +++ b/statsmodels/sandbox/multilinear.py @@ -142,7 +142,7 @@ def multiOLS(model, dataframe, column_list=None, method='fdr_bh', >>> multiOLS('GNP + 0', df, 'GNPDEFL') """ # data normalization - # if None take all the numerical columns that aren't present in the model + # if None take all the numerical columns that are not present in the model # it's not waterproof but is a good enough criterion for everyday use if column_list is None: column_list = [name for name in dataframe.columns diff --git a/statsmodels/sandbox/nonparametric/densityorthopoly.py b/statsmodels/sandbox/nonparametric/densityorthopoly.py index 609fe065d15..ef4b6102a7b 100644 --- a/statsmodels/sandbox/nonparametric/densityorthopoly.py +++ b/statsmodels/sandbox/nonparametric/densityorthopoly.py @@ -15,7 +15,7 @@ TODO: * check fourier case again: base is orthonormal, - but needs offsetfact = 0 and doesn't integrate to 1, rescaled looks good + but needs offsetfact = 0 and does not integrate to 1, rescaled looks good * hermite: works but DensityOrthoPoly requires currently finite bounds I use it with offsettfactor 0.5 in example * not implemented methods: @@ -47,7 +47,7 @@ class FPoly(object): '''Orthonormal (for weight=1) Fourier Polynomial on [0,1] - 
orthonormal polynomial but density needs corfactor that I don't see what + orthonormal polynomial but density needs corfactor that I do not see what it is analytically parameterization on [0,1] from @@ -72,7 +72,7 @@ def __call__(self, x): class F2Poly(object): '''Orthogonal (for weight=1) Fourier Polynomial on [0,pi] - is orthogonal but first component doesn't square-integrate to 1 + is orthogonal but first component does not square-integrate to 1 final result seems to need a correction factor of sqrt(pi) _corfactor = sqrt(pi) from integrating the density @@ -381,7 +381,7 @@ def _transform(self, x): # limits=None): ''' #use domain from first instance - #class doesn't have domain self.polybase.domain[0] AttributeError + #class does not have domain self.polybase.domain[0] AttributeError domain = self.polys[0].domain ilen = (domain[1] - domain[0]) diff --git a/statsmodels/sandbox/nonparametric/kdecovclass.py b/statsmodels/sandbox/nonparametric/kdecovclass.py index 24e01793233..7919ae617cb 100644 --- a/statsmodels/sandbox/nonparametric/kdecovclass.py +++ b/statsmodels/sandbox/nonparametric/kdecovclass.py @@ -76,7 +76,7 @@ def test_kde_1d(): # get kde for original sample gkde = stats.gaussian_kde(xn) - # evaluate the density funtion for the kde for some points + # evaluate the density function for the kde for some points xs = np.linspace(-7,7,501) kdepdf = gkde.evaluate(xs) normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd) @@ -118,7 +118,7 @@ def test_kde_1d(): # get kde for original sample #gkde = stats.gaussian_kde(xn) gkde = gaussian_kde_covfact(xn, 0.1) - # evaluate the density funtion for the kde for some points + # evaluate the density function for the kde for some points ind = np.linspace(-7,7,101) kdepdf = gkde.evaluate(ind) diff --git a/statsmodels/sandbox/nonparametric/kernels.py b/statsmodels/sandbox/nonparametric/kernels.py index bcced61f54f..7419b9273ca 100644 --- a/statsmodels/sandbox/nonparametric/kernels.py +++ 
b/statsmodels/sandbox/nonparametric/kernels.py @@ -74,7 +74,7 @@ def density(self, xs, x): if len(xs)>0: ## Need to do product of marginal distributions #w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n - #vectorized doesn't work: + #vectorized does not work: if self.weights is not None: w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights)/sum(self.weights) else: @@ -120,7 +120,7 @@ def __init__(self, shape, h = 1.0, domain = None, norm = None): """ shape should be a function taking and returning numeric type. - For sanity it should always return positive or zero but this isn't + For sanity it should always return positive or zero but this is not enforced in case you want to do weird things. Bear in mind that the statistical tests etc. may not be valid for non-positive kernels. diff --git a/statsmodels/sandbox/nonparametric/smoothers.py b/statsmodels/sandbox/nonparametric/smoothers.py index cc75471e0ea..a576ba78903 100644 --- a/statsmodels/sandbox/nonparametric/smoothers.py +++ b/statsmodels/sandbox/nonparametric/smoothers.py @@ -61,7 +61,7 @@ def conf(self, x): confidence. These bounds are based on variance only, and do not include the bias. If the bandwidth is much larger than the curvature of the underlying - funtion then the bias could be large. + function then the bias could be large. x is the points on which you want to evaluate the fit and the errors. @@ -100,7 +100,7 @@ class PolySmoother(object): """ #JP: heavily adjusted to work as plugin replacement for bspline # smoother in gam.py initialized by function default_smoother - # Only fixed exceptions, I didn't check whether it is statistically + # Only fixed exceptions, I did not check whether it is statistically # correctand I think it is not, there are still be some dimension # problems, and there were some dimension problems initially. 
# TODO: undo adjustments and fix dimensions correctly @@ -228,7 +228,7 @@ def fit(self, y, x=None, weights=None): ## if x is None: ## x = self.tau[(self.M-1):-(self.M-1)] # internal knots ## -## if pen == 0.: # can't use cholesky for singular matrices +## if pen == 0.: # cannot use cholesky for singular matrices ## banded = False ## ## if x.shape != y.shape: diff --git a/statsmodels/sandbox/nonparametric/tests/test_kernel_extras.py b/statsmodels/sandbox/nonparametric/tests/test_kernel_extras.py index b1bf0baac27..6287a1f4b08 100644 --- a/statsmodels/sandbox/nonparametric/tests/test_kernel_extras.py +++ b/statsmodels/sandbox/nonparametric/tests/test_kernel_extras.py @@ -73,6 +73,6 @@ def test_basic(self): var_type='c', k_linear=1) b_hat = np.squeeze(model.b) # Only tests for the linear part of the regression - # Currently doesn't work well with the nonparametric part + # Currently does not work well with the nonparametric part # Needs some more work npt.assert_allclose(b1, b_hat, rtol=0.1) diff --git a/statsmodels/sandbox/panel/mixed.py b/statsmodels/sandbox/panel/mixed.py index 3d4ee7ebfdb..2dff7930484 100644 --- a/statsmodels/sandbox/panel/mixed.py +++ b/statsmodels/sandbox/panel/mixed.py @@ -233,7 +233,7 @@ class OneWayMixed(object): convergence criteria for iteration Currently convergence in the iterative solver is reached if either the loglikelihood - *or* the fixed effects parameter don't change above tolerance. + *or* the fixed effects parameters do not change above tolerance. In some examples, the fixed effects parameters converged to 1e-5 within 150 iterations while the log likelihood did not converge within 2000 iterations. This might be @@ -402,7 +402,7 @@ def logL(self, ML=False): Return log-likelihood, REML by default. """ - #I don't know what the difference between REML and ML is here. + #I do not know what the difference between REML and ML is here. logL = 0.
for unit in self.units: @@ -502,7 +502,7 @@ def __init__(self, model): self.params = model.params - #need to overwrite this because we don't have a standard + #need to overwrite this because we do not have a standard #model.loglike yet #TODO: what todo about REML loglike, logL is not normalized @cache_readonly diff --git a/statsmodels/sandbox/panel/panel_short.py b/statsmodels/sandbox/panel/panel_short.py index c4f9472f190..147f3c18676 100644 --- a/statsmodels/sandbox/panel/panel_short.py +++ b/statsmodels/sandbox/panel/panel_short.py @@ -198,7 +198,7 @@ def fit_iterative(self, maxiter=3): redundant recalculations (whitening or calculating pinv_wexog). """ - #Note: in contrast to GLSHet, we don't have an auxiliary regression here + #Note: in contrast to GLSHet, we do not have an auxiliary regression here # might be needed if there is more structure in cov_i #because we only have the loop we are not attaching the ols_pooled diff --git a/statsmodels/sandbox/panel/panelmod.py b/statsmodels/sandbox/panel/panelmod.py index d7281b9bae6..8821673218a 100644 --- a/statsmodels/sandbox/panel/panelmod.py +++ b/statsmodels/sandbox/panel/panelmod.py @@ -73,7 +73,7 @@ def repanel_cov(groups, sigmas): groupuniq = np.unique(group) dummygr = sigmas[igr] * (group == groupuniq).astype(float) omega += np.dot(dummygr, dummygr.T) - ev, evec = np.linalg.eigh(omega) #eig doesn't work + ev, evec = np.linalg.eigh(omega) #eig does not work omegainv = np.dot(evec, (1/ev * evec).T) omegainvhalf = evec/np.sqrt(ev) return omega, omegainv, omegainvhalf @@ -180,7 +180,7 @@ def initialize(self, endog, exog, panel, time, xtnames, equation): # on the pandas LongPanel structure for speed and convenience. # not sure this part is finished... -#TODO: doesn't conform to new initialize +#TODO: does not conform to new initialize def initialize_pandas(self, panel_data, endog_name, exog_name): self.panel_data = panel_data endog = panel_data[endog_name].values # does this create a copy? 
@@ -338,7 +338,7 @@ class DynamicPanel(PanelModel): import numpy.lib.recfunctions as nprf data = sm.datasets.grunfeld.load(as_pandas=False) - # Baltagi doesn't include American Steel + # Baltagi does not include American Steel endog = data.endog[:-20] fullexog = data.exog[:-20] # fullexog.sort(order=['firm','year']) @@ -356,7 +356,7 @@ class DynamicPanel(PanelModel): year = fullexog['year'] panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'], equation='invest value capital') -# note that equation doesn't actually do anything but name the variables +# note that equation does not actually do anything but name the variables panel_ols = panel_mod.fit(model='pooled') panel_be = panel_mod.fit(model='between', effects='oneway') @@ -404,7 +404,7 @@ class DynamicPanel(PanelModel): omega = np.dot(dummyall, dummyall.T) + sigma* np.eye(nobs) print(omega) print(np.linalg.cholesky(omega)) - ev, evec = np.linalg.eigh(omega) #eig doesn't work + ev, evec = np.linalg.eigh(omega) #eig does not work omegainv = np.dot(evec, (1/ev * evec).T) omegainv2 = np.linalg.inv(omega) omegacomp = np.dot(evec, (ev * evec).T) @@ -412,7 +412,7 @@ class DynamicPanel(PanelModel): #check #print(np.dot(omegainv,omega) print(np.max(np.abs(np.dot(omegainv,omega) - np.eye(nobs)))) - omegainvhalf = evec/np.sqrt(ev) #not sure whether ev shouldn't be column + omegainvhalf = evec/np.sqrt(ev) #not sure whether ev should not be column print(np.max(np.abs(np.dot(omegainvhalf,omegainvhalf.T) - omegainv))) # now we can use omegainvhalf in GLS (instead of the cholesky) diff --git a/statsmodels/sandbox/pca.py b/statsmodels/sandbox/pca.py index d18afd0cb9e..dba7c8f18de 100644 --- a/statsmodels/sandbox/pca.py +++ b/statsmodels/sandbox/pca.py @@ -154,7 +154,7 @@ def project(self,vals=None,enthresh=None,nPCs=None,cumen=None): if nonnones == 0: m = slice(None) elif nonnones > 1: - raise ValueError("can't specify more than one threshold") + raise ValueError("cannot specify more than one threshold") 
else: if enthresh is not None: m = self.energies() > enthresh @@ -170,7 +170,7 @@ def project(self,vals=None,enthresh=None,nPCs=None,cumen=None): else: vals = np.array(vals,copy=False) if self.N.T.shape[0] != vals.shape[0]: - raise ValueError("shape for vals doesn't match") + raise ValueError("shape for vals does not match") proj = np.matrix(self.getEigenvectors()).T*vals return proj[m].T @@ -212,7 +212,7 @@ def subtractPC(self,pc,vals=None): else: vals = vals.T if vals.shape[1]!= self.A.shape[1]: - raise ValueError("vals don't have the correct number of components") + raise ValueError("vals do not have the correct number of components") pcs=self.project() zpcs=np.zeros_like(pcs) diff --git a/statsmodels/sandbox/regression/ar_panel.py b/statsmodels/sandbox/regression/ar_panel.py index cbf8b271f94..e46ae092b1b 100644 --- a/statsmodels/sandbox/regression/ar_panel.py +++ b/statsmodels/sandbox/regression/ar_panel.py @@ -74,7 +74,7 @@ def fit(self): y00 = 0.5*np.random.randn(nobs+1) - # I don't think a trend is handled yet + # I do not think a trend is handled yet data = np.arange(nobs) + y00[1:] + 0.2*y00[:-1] + 0.1*np.random.randn(nobs) #Are these AR(1) or MA(1) errors ??? data = y00[1:] + 0.6*y00[:-1] #+ 0.1*np.random.randn(nobs) @@ -106,7 +106,7 @@ def fit(self): exog = np.ones(nobs) exog = group_dummy mod = PanelAR1(y, exog, groups=groups) - #mod = PanelAR1(data, exog, groups=groups) #data doesn't contain different means + #mod = PanelAR1(data, exog, groups=groups) #data does not contain different means #print(mod.ar1filter(mod.endog, 1)) resa, reso = mod.fit() print(resa[0], reso.params) diff --git a/statsmodels/sandbox/regression/gmm.py b/statsmodels/sandbox/regression/gmm.py index d7adad7078c..095cb9d03a8 100644 --- a/statsmodels/sandbox/regression/gmm.py +++ b/statsmodels/sandbox/regression/gmm.py @@ -18,7 +18,7 @@ but not tested yet. 
DONE `fitonce` in DistQuantilesGMM, params are the same as in direct call to fitgmm move it to GMM class (once it's clearer for which cases I need this.) -* GMM doesn't know anything about the underlying model, e.g. y = X beta + u or panel +* GMM does not know anything about the underlying model, e.g. y = X beta + u or panel data model. It would be good if we can reuse methods from regressions, e.g. predict, fitted values, calculating the error term, and some result statistics. What's the best way to do this, multiple inheritance, outsourcing the functions, @@ -37,7 +37,7 @@ jval in jtest looks to large in example, but I have no idea about the size * bse for fitonce look too large (no time for checking now) formula for calc_cov_params for the case without optimal weighting matrix - is wrong. I don't have an estimate for omega in that case. And I'm confusing + is wrong. I do not have an estimate for omega in that case. And I am confused between weights and omega, which are *not* the same in this case. @@ -129,7 +129,7 @@ def fit(self): linear models. Parameter estimates and covariance are correct, but other results - haven't been tested yet, to seee whether they apply without changes. + have not been tested yet, to see whether they apply without changes. ''' #Greene 5th edt., p.78 section 5.4 @@ -230,7 +230,7 @@ def spec_hausman(self, dof=None): endog, exog = self.model.endog, self.model.exog resols = OLS(endog, exog).fit() normalized_cov_params_ols = resols.model.normalized_cov_params - # Stata `ivendog` doesn't use df correction for se + # Stata `ivendog` does not use df correction for se #se2 = resols.mse_resid #* resols.df_resid * 1.
/ len(endog) se2 = resols.ssr / len(endog) @@ -304,7 +304,7 @@ def summary(self, yname=None, xname=None, title=None, alpha=.05): #TODO: requiring list/iterable is a bit annoying #need more control over formatting - #TODO: default don't work if it's not identically spelled + #TODO: defaults do not work if it's not identically spelled top_left = [('Dep. Variable:', None), ('Model:', None), @@ -436,7 +436,7 @@ class GMM(Model): array of instruments, see notes nmoms : None or int number of moment conditions, if None then it is set equal to the - number of columns of instruments. Mainly needed to determin the shape + number of columns of instruments. Mainly needed to determine the shape or size of start parameters and starting weighting matrix. kwds : anything this is mainly if additional variables need to be stored for the @@ -479,7 +479,7 @@ def __init__(self, endog, exog, instrument, k_moms=None, k_params=None, ''' maybe drop and use mixin instead - TODO: GMM doesn't really care about the data, just the moment conditions + TODO: GMM does not really care about the data, just the moment conditions ''' instrument = self._check_inputs(instrument, endog) # attaches if needed super(GMM, self).__init__(endog, exog, missing=missing, @@ -982,7 +982,7 @@ def calc_weightmatrix(self, moms, weights_method='cov', wargs=(), centered = not ('centered' in wargs and not wargs['centered']) if not centered: - # caller doesn't want centered moment conditions + # caller does not want centered moment conditions moms_ = moms else: moms_ = moms - moms.mean() @@ -990,7 +990,7 @@ def calc_weightmatrix(self, moms, weights_method='cov', wargs=(), # TODO: store this outside to avoid doing this inside optimization loop # TODO: subclasses need to be able to add weights_methods, and remove # IVGMM can have homoscedastic (OLS), - # some options won't make sense in some cases + # some options will not make sense in some cases # possible add all here and allow subclasses to define a list # TODO: should
other weights_methods also have `ddof` if weights_method == 'cov': @@ -1033,14 +1033,14 @@ def calc_weightmatrix(self, moms, weights_method='cov', wargs=(), elif weights_method == 'iid': # only when we have instruments and residual mom = Z * u - # TODO: problem we don't have params in argument + # TODO: problem we do not have params in argument # I cannot keep everything in here w/o params as argument u = self.get_error(params) if centered: # Note: I'm not centering instruments, - # shouldn't we always center u? Ok, with centered as default - u -= u.mean(0) #demean inplace, we don't need original u + # should we not always center u? Ok, with centered as default + u -= u.mean(0) #demean inplace, we do not need original u instrument = self.instrument w = np.dot(instrument.T, instrument).dot(np.dot(u.T, u)) / nobs @@ -1148,14 +1148,14 @@ def _cov_params(self, **kwds): # this should use by default whatever options have been specified in # fit - # TODO: don't do this when we want to change options + # TODO: do not do this when we want to change options # if hasattr(self, '_cov_params'): # #replace with decorator later # return self._cov_params # set defaults based on fit arguments if 'wargs' not in kwds: - # Note: we don't check the keys in wargs, use either all or nothing + # Note: we do not check the keys in wargs, use either all or nothing kwds['wargs'] = self.wargs if 'weights_method' not in kwds: kwds['weights_method'] = self.options_other['weights_method'] @@ -1702,7 +1702,7 @@ def __init__(self, endog, exog, instrument, **kwds): self.epsilon_iter = 1e-5 self.distfn = kwds['distfn'] - #done by super doesn't work yet + #done by super does not work yet #TypeError: super does not take keyword arguments self.endog = endog @@ -1802,7 +1802,7 @@ def fitonce(self, start=None, weights=None, has_optimal_weights=False): self.results.params = params #required before call to self.cov_params self.results.wargs = {} #required before call to self.cov_params
self.results.options_other = {'weights_method':'cov'} - # TODO: which weights_method? There shouldn't be any needed ? + # TODO: which weights_method? There should not be any needed ? _cov_params = self.results.cov_params(weights=weights, has_optimal_weights=has_optimal_weights) diff --git a/statsmodels/sandbox/regression/ols_anova_original.py b/statsmodels/sandbox/regression/ols_anova_original.py index 6303a8aa00e..bef2b0f7d5a 100644 --- a/statsmodels/sandbox/regression/ols_anova_original.py +++ b/statsmodels/sandbox/regression/ols_anova_original.py @@ -206,7 +206,7 @@ def form2design(ss, data): Notes ----- - with sorted dict, separate name list wouldn't be necessary + with sorted dict, separate name list would not be necessary ''' vars = {} names = [] @@ -294,7 +294,7 @@ def dropname(ss, li): m = dta.mask.view(bool) droprows = m.reshape(-1,len(dta.dtype.names)).any(1) # get complete data as plain structured array -# maybe doesn't work with masked arrays +# maybe does not work with masked arrays dta_use_b1 = dta[~droprows,:].data print(dta_use_b1.shape) print(dta_use_b1.dtype) diff --git a/statsmodels/sandbox/regression/penalized.py b/statsmodels/sandbox/regression/penalized.py index 7ca43ac627b..29484447935 100644 --- a/statsmodels/sandbox/regression/penalized.py +++ b/statsmodels/sandbox/regression/penalized.py @@ -22,12 +22,12 @@ there is something fishy with the result instance, some things, e.g. -normalized_cov_params, don't look like they update correctly as we +normalized_cov_params, do not look like they update correctly as we search over lambda -> some stale state again ? I added df_model to result class using the hatmatrix, but df_model is defined in model instance not in result instance. -> not clear where refactoring should -occur. df_resid doesn't get updated correctly. +occur. df_resid does not get updated correctly. 
problem with definition of df_model, it has 1 subtracted for constant @@ -98,7 +98,7 @@ class TheilGLS(GLS): Baum, Christopher slides for tgmixed in Stata - (I don't remember what I used when I first wrote the code.) + (I do not remember what I used when I first wrote the code.) Parameters ---------- @@ -283,7 +283,7 @@ def select_pen_weight(self, method='aicc', start_params=1., optim_args=None): if optim_args is None: optim_args = {} - #this doesn't make sense, since number of parameters stays unchanged + #this does not make sense, since number of parameters stays unchanged # information criteria changes if we use df_model based on trace(hat_matrix) #need leave-one-out, gcv; or some penalization for weak priors #added extra penalization for lambd @@ -336,7 +336,7 @@ def hatmatrix_diag(self): ''' # TODO is this still correct with sandwich normalized_cov_params, I guess not xpxi = self.model.normalized_cov_params - #something fishy with self.normalized_cov_params in result, doesn't update + #something fishy with self.normalized_cov_params in result, does not update #print(self.model.wexog.shape, np.dot(xpxi, self.model.wexog.T).shape return (self.model.wexog * np.dot(xpxi, self.model.wexog.T).T).sum(1) @@ -346,7 +346,7 @@ def hatmatrix_trace(self): """ return self.hatmatrix_diag.sum() -## #this doesn't update df_resid +## #this does not update df_resid ## @property #needs to be property or attribute (no call) ## def df_model(self): ## return self.hatmatrix_trace() diff --git a/statsmodels/sandbox/regression/predstd.py b/statsmodels/sandbox/regression/predstd.py index 31ef7f97588..75f477bdf7d 100644 --- a/statsmodels/sandbox/regression/predstd.py +++ b/statsmodels/sandbox/regression/predstd.py @@ -68,7 +68,7 @@ def wls_prediction_std(res, exog=None, weights=None, alpha=0.05): ''' # work around current bug: - # fit doesn't attach results to model, predict broken + # fit does not attach results to model, predict broken #res.model.results covb = res.cov_params() diff 
--git a/statsmodels/sandbox/regression/runmnl.py b/statsmodels/sandbox/regression/runmnl.py index 47d2f8ad878..636605d8cb8 100644 --- a/statsmodels/sandbox/regression/runmnl.py +++ b/statsmodels/sandbox/regression/runmnl.py @@ -93,7 +93,7 @@ def loglike(self, params): xb = self.xbetas(params) expxb = np.exp(xb) sumexpxb = expxb.sum(1)#[:,None] - probs = expxb/expxb.sum(1)[:,None] #we don't really need this for all + probs = expxb/expxb.sum(1)[:,None] #we do not really need this for all loglike = (self.endog * np.log(probs)).sum(1) #is this the same: YES #self.logliketest = (self.endog * xb).sum(1) - np.log(sumexpxb) @@ -110,7 +110,7 @@ class TryNCLogit(object): ''' Nested Conditional Logit (RUNMNL), data handling test - unfinished, doesn't do anything yet + unfinished, does not do anything yet ''' @@ -240,10 +240,10 @@ def calc_prob(self, tree, keys=None): dta = np.genfromtxt('TableF23-2.txt', skip_header=1, names='Mode Ttme Invc Invt GC Hinc PSize'.split()) -endog = dta['Mode'].reshape(-1,4).copy() #I don't want a view +endog = dta['Mode'].reshape(-1,4).copy() #I do not want a view nobs, nchoices = endog.shape datafloat = dta.view(float).reshape(-1,7) -exog = datafloat[:,1:].reshape(-1,6*nchoices).copy() #I don't want a view +exog = datafloat[:,1:].reshape(-1,6*nchoices).copy() #I do not want a view print(endog.sum(0)) varnames = dta.dtype.names @@ -289,7 +289,7 @@ def calc_prob(self, tree, keys=None): xi = [] for ii in range(4): xi.append(dta1[xivar[ii]][choice_index==ii]) - #this doesn't change sequence of columns, bug report by Skipper I think + #this does not change sequence of columns, bug report by Skipper I think ncommon = 2 betaind = [len(xi[ii].dtype.names)-ncommon for ii in range(4)] diff --git a/statsmodels/sandbox/regression/sympy_diff.py b/statsmodels/sandbox/regression/sympy_diff.py index 758e9138657..28139e5a98f 100644 --- a/statsmodels/sandbox/regression/sympy_diff.py +++ b/statsmodels/sandbox/regression/sympy_diff.py @@ -57,6 +57,6 @@ def cdf(x, 
mu, sigma): print(dlddf.subs(dict(y=1,mu=1,sigma2=1.5,df=10.0001))) print(dlddf.subs(dict(y=1,mu=1,sigma2=1.5,df=10.0001)).evalf()) -# Note: derivatives of nested function doesn't work in sympy +# Note: derivatives of nested functions do not work in sympy # at least not higher order derivatives (second or larger) # looks like print(failure diff --git a/statsmodels/sandbox/regression/tests/test_gmm.py b/statsmodels/sandbox/regression/tests/test_gmm.py index 948e93e7fa1..93e4a9abc1d 100644 --- a/statsmodels/sandbox/regression/tests/test_gmm.py +++ b/statsmodels/sandbox/regression/tests/test_gmm.py @@ -351,7 +351,7 @@ def test_bse_other(self): # TODO: next two produce the same as before (looks like) bse = np.sqrt(np.diag((res1._cov_params(has_optimal_weights=False)))) #weights=res1.weights)))) - # TODO: doesn't look different + # TODO: does not look different #assert_allclose(res1.bse, res2.bse, rtol=5e-06, atol=0) #nobs = instrument.shape[0] #w0inv = np.dot(instrument.T, instrument) / nobs @@ -429,13 +429,13 @@ def test_bse_other(self): # TODO: next two produce the same as before (looks like) bse = np.sqrt(np.diag((res1._cov_params(has_optimal_weights=False, weights=res1.weights)))) - # TODO: doesn't look different + # TODO: does not look different #assert_allclose(res1.bse, res2.bse, rtol=5e-06, atol=0) bse = np.sqrt(np.diag((res1._cov_params(has_optimal_weights=False)))) #use_weights=True #weights=w #assert_allclose(res1.bse, res2.bse, rtol=5e-06, atol=0) - #This doesn't replicate Stata oneway either + #This does not replicate Stata oneway either nobs = instrument.shape[0] w0inv = np.dot(instrument.T, instrument) / nobs q = self.res1.model.gmmobjective(self.res1.params, w)#self.res1.weights) @@ -655,7 +655,7 @@ def test_basic(self): # TODO: resolve this # try bse from previous step, is closer to Stata - # guess: Stata ivreg2 doesn't calc for bse update after final iteration + # guess: Stata ivreg2 does not calc for bse update after final iteration # need better
test case, bse difference is close to numerical optimization precision assert_allclose(self.res3.bse, res2.bse, rtol=5e-05, atol=0) assert_allclose(self.res3.bse, res2.bse, rtol=0, atol=5e-06) diff --git a/statsmodels/sandbox/regression/treewalkerclass.py b/statsmodels/sandbox/regression/treewalkerclass.py index 07cf589fc1b..c40562438e8 100644 --- a/statsmodels/sandbox/regression/treewalkerclass.py +++ b/statsmodels/sandbox/regression/treewalkerclass.py @@ -74,7 +74,7 @@ (? check transformation) to sample frequencies and zeros for slope coefficient as starting values for (non-nested) MNL * associated test statistics - - (I don't think I will fight with the gradient or hessian of the log-like.) + - (I do not think I will fight with the gradient or hessian of the log-like.) - basic MLE statistics can be generic - tests specific to the model (?) * nice printouts since I'm currently collecting a lot of information in the tree @@ -353,7 +353,7 @@ def calc_prob(self, tree, parent=None): if DEBUG: print(b) bv = self.calc_prob(b, name) - bv = np.exp(bv/tau) #this shouldn't be here, when adding branch data + bv = np.exp(bv/tau) #this should not be here, when adding branch data branchvalue.append(bv) branchsum = branchsum + bv self.branchvalues[name] = branchvalue #keep track what was returned diff --git a/statsmodels/sandbox/regression/try_ols_anova.py b/statsmodels/sandbox/regression/try_ols_anova.py index 8217570f3f9..3892e733e2d 100644 --- a/statsmodels/sandbox/regression/try_ols_anova.py +++ b/statsmodels/sandbox/regression/try_ols_anova.py @@ -94,7 +94,7 @@ def anovadict(res): not checked for completeness ''' ad = {} - ad.update(res.__dict__) #dict doesn't work with cached attributes + ad.update(res.__dict__) #dict does not work with cached attributes anova_attr = ['df_model', 'df_resid', 'ess', 'ssr','uncentered_tss', 'mse_model', 'mse_resid', 'mse_total', 'fvalue', 'f_pvalue', 'rsquared'] @@ -139,7 +139,7 @@ def form2design(ss, data): Notes ----- - with sorted dict, 
separate name list wouldn't be necessary + with sorted dict, separate name list would not be necessary ''' vars = {} names = [] @@ -242,7 +242,7 @@ def dropname(ss, li): m = dta.mask.view(bool) droprows = m.reshape(-1,len(dta.dtype.names)).any(1) # get complete data as plain structured array - # maybe doesn't work with masked arrays + # maybe does not work with masked arrays dta_use_b1 = dta[~droprows,:].data print(dta_use_b1.shape) print(dta_use_b1.dtype) diff --git a/statsmodels/sandbox/regression/try_treewalker.py b/statsmodels/sandbox/regression/try_treewalker.py index 2c823cae1cf..266db46d026 100644 --- a/statsmodels/sandbox/regression/try_treewalker.py +++ b/statsmodels/sandbox/regression/try_treewalker.py @@ -38,7 +38,7 @@ def branch(tree): #new version that also keeps track of branch name and allows V_j for a branch -# as in Greene, V_j + lamda * IV doesn't look the same as including the +# as in Greene, V_j + lamda * IV does not look the same as including the # explanatory variables in leaf X_j, V_j is linear in X, IV is logsumexp of X, diff --git a/statsmodels/sandbox/stats/diagnostic.py b/statsmodels/sandbox/stats/diagnostic.py index 1b651aa9370..b5e798ec5bf 100644 --- a/statsmodels/sandbox/stats/diagnostic.py +++ b/statsmodels/sandbox/stats/diagnostic.py @@ -155,7 +155,7 @@ class CompareJ(object): From description in Greene, section 8.3.3 produces correct results for Example 8.3, Greene - not checked yet - #currently an exception, but I don't have clean reload in python session + #currently an exception, but I do not have clean reload in python session check what results should be attached @@ -302,7 +302,7 @@ def acorr_ljungbox(x, lags=None, boxpierce=False): def acorr_lm(x, maxlag=None, autolag='AIC', store=False, regresults=False): '''Lagrange Multiplier tests for autocorrelation - This is a generic Lagrange Multiplier test for autocorrelation. I don't + This is a generic Lagrange Multiplier test for autocorrelation. 
I do not have a reference for it, but it returns Engle's ARCH test if x is the squared residual array. A variation on it with additional exogenous variables is the Breusch-Godfrey autocorrelation test. @@ -573,7 +573,7 @@ def het_breuschpagan(resid, exog_het): Assumes x contains constant (for counting dof and calculation of R^2). In the general description of LM test, Greene mentions that this test exaggerates the significance of results in small or moderately large - samples. In this case the F-statistic is preferrable. + samples. In this case the F-statistic is preferable. *Verification* @@ -949,7 +949,7 @@ def linear_lm(resid, exog, func=None): '''Lagrange multiplier test for linearity against functional alternative limitations: Assumes currently that the first column is integer. - Currently it doesn't check whether the transformed variables contain NaNs, + Currently it does not check whether the transformed variables contain NaNs, for example log of negative number. Parameters @@ -1397,7 +1397,7 @@ def breaks_cusumolsresid(olsresidual, ddof=0): Not clear: Assumption 2 in Ploberger, Kramer assumes that exog x have asymptotically zero mean, x.mean(0) = [1, 0, 0, ..., 0] - Is this really necessary? I don't see how it can affect the test statistic + Is this really necessary? I do not see how it can affect the test statistic under the null. It does make a difference under the alternative. Also, the asymptotic distribution of test statistic depends on this. 
diff --git a/statsmodels/sandbox/stats/multicomp.py b/statsmodels/sandbox/stats/multicomp.py index d39b665b2bc..74a34a2863c 100644 --- a/statsmodels/sandbox/stats/multicomp.py +++ b/statsmodels/sandbox/stats/multicomp.py @@ -7,7 +7,7 @@ - one example taken from lecture notes looks ok - needs cases with non-monotonic inequality for test to see difference between one-step, step-up and step-down procedures - - FDR doesn't look really better then Bonferoni in the MC examples that I tried + - FDR does not look really better than Bonferroni in the MC examples that I tried update: - now tested against R, stats and multtest, I have all of their methods for p-value correction @@ -15,7 +15,7 @@ - now, since I have p-values correction, some of the original tests (rej/norej) implementation is not really needed anymore. I think I keep it for reference. Test procedure for Hommel in development session log - - I haven't updated other functions and classes in here. + - I have not updated other functions and classes in here. - multtest has some good helper function according to docs - still need to update references, the real papers - fdr with estimated true hypothesis still missing @@ -49,7 +49,7 @@ for multicomparison new book "multiple comparison in R" -Hsu is a good reference but I don't have it. +Hsu is a good reference but I do not have it. Author: Josef Pktd and example from H Raja and rewrite from Vincent Davis @@ -379,7 +379,7 @@ def rejectionline(n, alpha=0.5): -#I don't remember what I changed or why 2 versions, +#I do not remember what I changed or why 2 versions,
with rline #this might be useful if the null hypothesis is not "all effects are zero" #rename to _bak and working again on fdrcorrection0 @@ -661,7 +661,7 @@ def plot_simultaneous(self, comparison_name=None, ax=None, figsize=(10,6), xlabel=None, ylabel=None): """Plot a universal confidence interval of each group mean - Visiualize significant differences in a plot with one confidence + Visualize significant differences in a plot with one confidence interval per group instead of all pairwise confidence intervals. Parameters @@ -1536,7 +1536,7 @@ class StepDown(object): One change to make it more flexible, is to separate out the decision on a subset, also because the F-based tests, FREGW in SPSS, take information from all elements of - a set and not just pairwise comparisons. I haven't looked at the details of + a set and not just pairwise comparisons. I have not looked at the details of the F-based tests such as Sheffe yet. It looks like running an F-test on equality of means in each subset. This would also outsource how pairwise conditions are combined, any larger or max. This would also imply that the distance matrix cannot @@ -1708,7 +1708,7 @@ def set_partition(ssli): '''extract a partition from a list of tuples this should be correctly called select largest disjoint sets. - Begun and Gabriel 1981 don't seem to be bothered by sets of accepted + Begun and Gabriel 1981 do not seem to be bothered by sets of accepted hypothesis with joint elements, e.g. maximal_accepted_sets = { {1,2,3}, {2,3,4} } @@ -1716,7 +1716,7 @@ def set_partition(ssli): It tries to find the partition with the largest sets. That is, sets are included after being sorted by length. - If the list doesn't include the singletons, then it will be only a + If the list does not include the singletons, then it will be only a partial partition. Missing items are singletons (I think). 
Examples @@ -1763,7 +1763,7 @@ def set_remove_subs(ssli): [(1, 1, 1, 2, 3), (0, 1)] ''' - #TODO: maybe convert all tuples to sets immediately, but I don't need the extra efficiency + #TODO: maybe convert all tuples to sets immediately, but I do not need the extra efficiency part = [] for s in sorted(list(set(ssli)), key=lambda x: len(set(x)))[::-1]: #print(s, diff --git a/statsmodels/sandbox/stats/notes_fdr.txt b/statsmodels/sandbox/stats/notes_fdr.txt index d027c346640..b53590c580f 100644 --- a/statsmodels/sandbox/stats/notes_fdr.txt +++ b/statsmodels/sandbox/stats/notes_fdr.txt @@ -5,9 +5,9 @@ Multiple Tests and Multiple Comparisons Introduction ------------ -generic multiple testing procedures, p-value corrections and fdr don't use +generic multiple testing procedures, p-value corrections and fdr do not use any additional information, only information contained in p-values. -I don't know if there are any underlying assumption, except that the raw +I do not know if there are any underlying assumption, except that the raw pvalues are uniformly on [0,1] distributed under the null hypothesis. fdr for microarray or fmri can use special structure diff --git a/statsmodels/sandbox/stats/runs.py b/statsmodels/sandbox/stats/runs.py index cf4f4d3599c..8d6d92d0bc7 100644 --- a/statsmodels/sandbox/stats/runs.py +++ b/statsmodels/sandbox/stats/runs.py @@ -3,7 +3,7 @@ formulas for mean and var of runs taken from SAS manual NPAR tests, also idea for runstest_1samp and runstest_2samp -Description in NIST handbook and dataplot doesn't explain their expected +Description in NIST handbook and dataplot does not explain their expected values, or variance Note: @@ -184,7 +184,7 @@ def runstest_2samp(x, y=None, groups=None, correction=True): maximum number of runs would use alternating groups in the ties.) Maybe adding random noise would be the better approach. 
- SAS has exact distribution for sample size <=30, doesn't look standard + SAS has exact distribution for sample size <=30, does not look standard but should be easy to add. currently two-sided test only @@ -225,7 +225,7 @@ def runstest_2samp(x, y=None, groups=None, correction=True): print('ties detected') #replace with warning x_mindiff = x_diff[x_diff > 0].min() eps = x_mindiff/2. - xx = x.copy() #don't change original, just in case + xx = x.copy() #do not change original, just in case xx[groups==gruni[0]] += eps xargsort = np.argsort(xx) @@ -256,7 +256,7 @@ class TotalRunsProb(object): Notes ----- - Written as a class so I can store temporary calculations, but I don't + Written as a class so I can store temporary calculations, but I do not think it matters much. Formulas taken from SAS manual for one-sided significance level. @@ -544,7 +544,7 @@ def mcnemar(x, y=None, exact=True, correction=True): n1, n2 = x[1, 0], x[0, 1] else: # I'm not checking here whether x and y are binary, - # isn't this also paired sign test + # is this not also a paired sign test n1 = np.sum(x < y, 0) n2 = np.sum(x > y, 0) @@ -608,7 +608,7 @@ def symmetry_bowker(table): if k != k2: raise ValueError('table needs to be square') - #low_idx = np.tril_indices(k, -1) # this doesn't have Fortran order + #low_idx = np.tril_indices(k, -1) # this does not have Fortran order upp_idx = np.triu_indices(k, 1) tril = table.T[upp_idx] # lower triangle in column order diff --git a/statsmodels/sandbox/stats/stats_dhuard.py b/statsmodels/sandbox/stats/stats_dhuard.py index 2fef5e78399..968df2cb607 100644 --- a/statsmodels/sandbox/stats/stats_dhuard.py +++ b/statsmodels/sandbox/stats/stats_dhuard.py @@ -6,7 +6,7 @@ Notes ===== -out of bounds interpolation raises exception and wouldn't be completely +out of bounds interpolation raises exception and would not be completely defined :: >>> scoreatpercentile(x, [0,25,50,100]) @@ -52,7 +52,7 @@ * ppf how do I get the inverse function of a higher order spline?
Chuck: resample and fit spline to inverse function this will have an approximation error in the inverse function -* -> doesn't work: higher order spline doesn't preserve monotonicity +* -> does not work: higher order spline does not preserve monotonicity see mailing list for response to my question * pmf from derivative available in spline @@ -316,7 +316,7 @@ def optimize_binning(self, method='Freedman'): ppfs = ppfintp(cdf_ongrid) plt.plot(ppfs, cdf_ongrid) #ppfemp=interpolate.InterpolatedUnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3) - #Don't use interpolating splines for function approximation + #Do not use interpolating splines for function approximation #with s=0.03 the spline is monotonic at the evaluated values ppfemp=interpolate.UnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3, s=0.03) ppfe = ppfemp(cdf_ongrid) diff --git a/statsmodels/sandbox/stats/stats_mstats_short.py b/statsmodels/sandbox/stats/stats_mstats_short.py index 2d3008162f8..a87edf98255 100644 --- a/statsmodels/sandbox/stats/stats_mstats_short.py +++ b/statsmodels/sandbox/stats/stats_mstats_short.py @@ -4,7 +4,7 @@ - data is masked array - data requires nan handling (masknan=True) - data should be trimmed (limit is non-empty) -handle simple cases directly, which doesn't require apply_along_axis +handle simple cases directly, which does not require apply_along_axis changes compared to mstats: plotting_positions for n-dim with axis argument addition: plotting_positions_w1d: with weights, 1d ndarray only @@ -16,7 +16,7 @@ rename alphap, betap for consistency timing question: one additional argsort versus apply_along_axis weighted plotting_positions -- I haven't figured out nd version of weighted plotting_positions +- I have not figured out nd version of weighted plotting_positions - add weighted quantiles @@ -262,7 +262,7 @@ def plotting_positions(data, alpha=0.4, beta=0.4, axis=0, masknan=False): plpos = np.empty(data.shape, dtype=float) plpos[data.argsort()] = 
(np.arange(1,n+1) - alpha)/(n+1.-alpha-beta) else: - #nd assignment instead of second argsort doesn't look easy + #nd assignment instead of second argsort does not look easy plpos = (data.argsort(axis).argsort(axis) + 1. - alpha)/(n+1.-alpha-beta) return plpos @@ -344,7 +344,7 @@ def edf_normal_inverse_transformed(x, alpha=3./8, beta=3./8, axis=0): print((quantiles(xm, axis=ax) == quantiles(x, axis=ax)).all()) print((stats.mstats.mquantiles(ma.fix_invalid(x2), axis=ax) == quantiles(x2, axis=ax, masknan=1)).all()) - #stats version doesn't have axis + #stats version does not have axis print((stats.mstats.plotting_positions(ma.fix_invalid(x2)) == plotting_positions(x2, axis=None, masknan=1)).all()) #test 3d diff --git a/statsmodels/sandbox/sysreg.py b/statsmodels/sandbox/sysreg.py index 47d8e67cfd4..8b91359e096 100644 --- a/statsmodels/sandbox/sysreg.py +++ b/statsmodels/sandbox/sysreg.py @@ -76,7 +76,7 @@ class SUR(object): Notes ----- - All individual equations are assumed to be well-behaved, homoeskedastic + All individual equations are assumed to be well-behaved, homoskedastic iid errors. This is basically an extension of GLS, using sparse matrices. .. math:: \\Sigma=\\left[\\begin{array}{cccc} @@ -178,7 +178,7 @@ def _compute_sigma(self, resids): div[i+j] = nobs - np.max(self.df_model[i]+1, self.df_model[j]+1) div.reshape(M,M) -# doesn't handle (#,) +# does not handle (#,) self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sig/div)).T return sig/div @@ -238,7 +238,7 @@ def fit(self, igls=False, tol=1e-5, maxiter=100): while igls and (np.any(np.abs(conv[-2] - conv[-1]) > tol)) and \ (self.iterations < maxiter): fittedvalues = (self.sp_exog*beta).reshape(M,-1) - resids = self.endog - fittedvalues # don't attach results yet + resids = self.endog - fittedvalues # do not attach results yet self.sigma = self._compute_sigma(resids) # need to attach for compute? 
self.wendog = self.whiten(self.endog) self.wexog = self.whiten(self.sp_exog) @@ -269,7 +269,7 @@ class Sem2SLS(object): indep_endog : dict A dictionary mapping the equation to the column numbers of the the independent endogenous regressors in each equation. - It is assumed that the system is inputed as broken up into + It is assumed that the system is entered as broken up into LHS and RHS. For now, the values of the dict have to be sequences. Note that the keys for the equations should be zero-indexed. instruments : array diff --git a/statsmodels/sandbox/tests/test_gam.py b/statsmodels/sandbox/tests/test_gam.py index 411a7d96c94..242efb29a76 100644 --- a/statsmodels/sandbox/tests/test_gam.py +++ b/statsmodels/sandbox/tests/test_gam.py @@ -12,10 +12,10 @@ ----- TODO: TestGAMGamma: has test failure (GLM looks good), - adding log-link didn't help - resolved: gamma doesn't fail anymore after tightening the + adding log-link did not help + resolved: gamma does not fail anymore after tightening the convergence criterium (rtol=1e-6) -TODO: TestGAMNegativeBinomial: rvs generation doesn't work, +TODO: TestGAMNegativeBinomial: rvs generation does not work, nbinom needs 2 parameters TODO: TestGAMGaussianLogLink: test failure, but maybe precision issue, not completely off @@ -198,7 +198,7 @@ def setup_class(cls): res1.params = np.array([const] + slopes) def test_fitted(self): - # We have to override the base class because this case doesn't fail, + # We have to override the base class because this case does not fail, # while all others in this module do (as of 2019-05-22) super(TestAdditiveModel, self).test_fitted() @@ -219,7 +219,7 @@ def init(cls): cls.mu_true = mu_true = f.link.inverse(y_true) np.random.seed(8765993) - # Discrete distributions don't take `scale`. + # Discrete distributions do not take `scale`. 
try: y_obs = cls.rvs(mu_true, scale=scale, size=nobs) except TypeError: @@ -314,7 +314,7 @@ def setup_class(cls): "to _parse_args_rvs", strict=True, raises=TypeError) class TestGAMNegativeBinomial(BaseGAM): - # TODO: rvs generation doesn't work, nbinom needs 2 parameters + # TODO: rvs generation does not work, nbinom needs 2 parameters @classmethod def setup_class(cls): diff --git a/statsmodels/sandbox/tools/TODO.txt b/statsmodels/sandbox/tools/TODO.txt index 6da243a35ba..b35b0e9a945 100644 --- a/statsmodels/sandbox/tools/TODO.txt +++ b/statsmodels/sandbox/tools/TODO.txt @@ -58,7 +58,7 @@ * other models in draft stage -> requires cleaning - gaussian process -> might fit in - - multinomial logit -> requires ML, result statistics don't fit into current classes ? + - multinomial logit -> requires ML, result statistics do not fit into current classes ? * stochastic processes, time series -> first step is relatively easy more simulators, random process generators, for fun and Monte Carlo and testing diff --git a/statsmodels/sandbox/tools/cross_val.py b/statsmodels/sandbox/tools/cross_val.py index 59e1182b4a2..4f1a0f1043f 100644 --- a/statsmodels/sandbox/tools/cross_val.py +++ b/statsmodels/sandbox/tools/cross_val.py @@ -307,7 +307,7 @@ def __init__(self, n, k=1, start=None, kall=True, return_slice=True): Notes ----- - I don't think this is really useful, because it can be done with + I do not think this is really useful, because it can be done with a very simple loop instead. Useful as a plugin, but it could return slices instead for faster array access. diff --git a/statsmodels/sandbox/tools/mctools.py b/statsmodels/sandbox/tools/mctools.py index 22e0f074992..485dc381e77 100644 --- a/statsmodels/sandbox/tools/mctools.py +++ b/statsmodels/sandbox/tools/mctools.py @@ -63,7 +63,7 @@ class StatTestMC(object): not be updated, and, therefore, not correspond to the same run. .. 
Warning:: - Under Construction, don't expect stability in Api or implementation + Under Construction, do not expect stability in API or implementation Examples @@ -165,7 +165,7 @@ def histogram(self, idx=None, critval=None): does not do any plotting - I don't remember what I wanted here, looks similar to the new cdf + I do not remember what I wanted here, looks similar to the new cdf method, but this also does a binned pdf (self.histo) @@ -331,7 +331,7 @@ def plot_hist(self, idx, distpdf=None, bins=50, ax=None, kwds=None): import matplotlib.pyplot as plt - #I don't want to figure this out now + #I do not want to figure this out now # if ax=None: # fig = plt.figure() # ax = fig.addaxis() diff --git a/statsmodels/sandbox/tools/tools_pca.py b/statsmodels/sandbox/tools/tools_pca.py index 8faa5ce16c3..c13d1ff1528 100644 --- a/statsmodels/sandbox/tools/tools_pca.py +++ b/statsmodels/sandbox/tools/tools_pca.py @@ -47,7 +47,7 @@ def pca(data, keepdim=0, normalize=0, demean=True): ''' x = np.array(data) - #make copy so original doesn't change, maybe not necessary anymore + #make copy so original does not change, maybe not necessary anymore if demean: m = x.mean(0) else: @@ -114,13 +114,13 @@ def pcasvd(data, keepdim=0, demean=True): Notes ----- - This doesn't have yet the normalize option of pca. + This does not yet have the normalize option of pca.
''' nobs, nvars = data.shape #print nobs, nvars, keepdim x = np.array(data) - #make copy so original doesn't change + #make copy so original does not change if demean: m = x.mean(0) else: diff --git a/statsmodels/sandbox/tsa/diffusion.py b/statsmodels/sandbox/tsa/diffusion.py index ab82b5a33f6..3c5280ded3b 100644 --- a/statsmodels/sandbox/tsa/diffusion.py +++ b/statsmodels/sandbox/tsa/diffusion.py @@ -101,7 +101,7 @@ def __init__(self): pass def sim(self, nobs=100, T=1, dt=None, nrepl=1): - # this doesn't look correct if drift or sig depend on x + # this does not look correct if drift or sig depend on x # see arithmetic BM W, t = self.simulateW(nobs=nobs, T=T, dt=dt, nrepl=nrepl) dx = self._drift() + self._sig() * W @@ -179,7 +179,7 @@ def exactprocess(self, xzero, nobs, ddt=1., nrepl=2): expddt = np.exp(-self.lambd * ddt) normrvs = np.random.normal(size=(nrepl,nobs)) #do I need lfilter here AR(1) ? if mean reverting lag-coeff<1 - #lfilter doesn't handle 2d arrays, it does? + #lfilter does not handle 2d arrays, it does? inc = self._exactconst(expddt) + self._exactstd(expddt) * normrvs return signal.lfilter([1.], [1.,-expddt], inc) @@ -293,7 +293,7 @@ def exactprocess(self, xzero, nobs, ddt=1., nrepl=2): expnt = np.exp(-self.lambd * t) expddt = np.exp(-self.lambd * ddt) normrvs = np.random.normal(size=(nrepl,nobs)) - #do I need lfilter here AR(1) ? lfilter doesn't handle 2d arrays, it does? + #do I need lfilter here AR(1) ? lfilter does not handle 2d arrays, it does? from scipy import signal #xzero * expnt inc = ( self.mu * (1-expddt) + diff --git a/statsmodels/sandbox/tsa/example_arma.py b/statsmodels/sandbox/tsa/example_arma.py index dd82c66f896..f1343a28380 100644 --- a/statsmodels/sandbox/tsa/example_arma.py +++ b/statsmodels/sandbox/tsa/example_arma.py @@ -189,7 +189,7 @@ def autocorr(s, axis=-1): #JP: with valid this returns a single value, if x and y have same length # e.g. 
norm_corr(x, x) -# using std subtracts mean, but correlate doesn't, requires means are exactly 0 +# using std subtracts mean, but correlate does not, requires means are exactly 0 # biased, no n-k correction for laglength #from nitime.utils def norm_corr(x,y,mode = 'valid'): diff --git a/statsmodels/sandbox/tsa/examples/ex_mle_garch.py b/statsmodels/sandbox/tsa/examples/ex_mle_garch.py index a9a1409fe8a..7d826a7754b 100644 --- a/statsmodels/sandbox/tsa/examples/ex_mle_garch.py +++ b/statsmodels/sandbox/tsa/examples/ex_mle_garch.py @@ -233,7 +233,7 @@ '''based on R default simulation model = list(omega = 1e-06, alpha = 0.1, beta = 0.8) nobs = 1000 -(with nobs=500, gjrgarch doesn't do well +(with nobs=500, gjrgarch does not do well >>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000) Optimization terminated successfully. diff --git a/statsmodels/sandbox/tsa/fftarma.py b/statsmodels/sandbox/tsa/fftarma.py index 2b9a68d5b85..5b20845b288 100644 --- a/statsmodels/sandbox/tsa/fftarma.py +++ b/statsmodels/sandbox/tsa/fftarma.py @@ -17,10 +17,10 @@ -> use pade (scipy.misc) approximation to get starting polynomial from autocorrelation (is autocorrelation of AR(p) related to marep?) check if pade is fast, not for larger arrays ? - maybe pade doesn't do the right thing for this, not tried yet + maybe pade does not do the right thing for this, not tried yet scipy.pade([ 1. , 0.6, 0.25, 0.125, 0.0625, 0.1],2) raises LinAlgError: singular matrix - also doesn't have roots inside unit circle ?? + also does not have roots inside unit circle ?? -> even without initialization, it might be fast for estimation -> how do I enforce stationarity and invertibility, need helper function @@ -49,7 +49,7 @@ class ArmaFft(ArmaProcess): Notes ----- TODO: - check whether we don't want to fix maxlags, and create new instance if + check whether we do not want to fix maxlags, and create new instance if maxlag changes. usage for different lengths of timeseries ? 
or fix frequency and length for fft @@ -192,7 +192,7 @@ def spd(self, npos): n = npos w = fft.fftfreq(2*n) * 2 * np.pi hw = self.fftarma(2*n) #not sure, need to check normalization - #return (hw*hw.conj()).real[n//2-1:] * 0.5 / np.pi #doesn't show in plot + #return (hw*hw.conj()).real[n//2-1:] * 0.5 / np.pi #does not show in plot return (hw*hw.conj()).real * 0.5 / np.pi, w def spdshift(self, n): diff --git a/statsmodels/sandbox/tsa/garch.py b/statsmodels/sandbox/tsa/garch.py index 05757a915a2..f25b003e96a 100644 --- a/statsmodels/sandbox/tsa/garch.py +++ b/statsmodels/sandbox/tsa/garch.py @@ -6,7 +6,7 @@ subclass defines geterrors(parameters) besides loglike,... and covariance matrix of parameter estimates (e.g. from hessian or outerproduct of jacobian) -update: I don't really need geterrors directly, but get_h the conditional +update: I do not really need geterrors directly, but get_h the conditional variance process new version Garch0 looks ok, time to clean up and test @@ -16,7 +16,7 @@ Notes ----- -idea: cache intermediate design matrix for geterrors so it doesn't need +idea: cache intermediate design matrix for geterrors so it does not need to be build at each function call superclass or result class calculates result statistic based @@ -342,7 +342,7 @@ def geth(self, params): icetax = self._icetax #read ic-eta-x, initial condition #TODO: where does my go with lfilter ????????????? - # shouldn't matter except for interpretation + # should not matter except for interpretation nobs = etax.shape[0] @@ -390,7 +390,7 @@ def loglike(self, params): sigma2 = np.maximum(h, 1e-6) axis = 0 nobs = len(h) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 axis=0 #no choice of axis @@ -459,7 +459,7 @@ def geth(self, params): icetax = self._icetax #read ic-eta-x, initial condition #TODO: where does my go with lfilter ????????????? 
- # shouldn't matter except for interpretation + # should not matter except for interpretation nobs = self.nobs @@ -516,7 +516,7 @@ def loglike(self, params): sigma2 = np.maximum(h, 1e-6) axis = 0 nobs = len(h) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 axis=0 #no choice of axis @@ -613,7 +613,7 @@ def loglike(self, params): sigma2 = np.maximum(h, 1e-6) axis = 0 nobs = len(errorsest) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 axis=0 #not used # muy = errorsest.mean() @@ -870,7 +870,7 @@ def loglike(self, params): sigma2 = np.maximum(params[-1]**2, 1e-6) axis = 0 nobs = len(errorsest) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 # llike = -0.5 * (np.sum(np.log(sigma2),axis) # + np.sum((errorsest**2)/sigma2, axis) @@ -1485,7 +1485,7 @@ def garchplot(err, h, title='Garch simulation'): #armodel.fit(method='tnc') #powell should be the most robust, see Hamilton 5.7 armodel.fit(method='powell', penalty=True) - # The below don't work yet + # The below do not work yet #armodel.fit(method='newton', penalty=True) #armodel.fit(method='broyden', penalty=True) print("Unconditional MLE for AR(1) y_t = .9*y_t-1 +.01 * err") diff --git a/statsmodels/sandbox/tsa/movstat.py b/statsmodels/sandbox/tsa/movstat.py index 56d98045a76..a1c5dccc2f0 100644 --- a/statsmodels/sandbox/tsa/movstat.py +++ b/statsmodels/sandbox/tsa/movstat.py @@ -12,7 +12,7 @@ TODO moving statistics -- filters don't handle boundary conditions nicely (correctly ?) +- filters do not handle boundary conditions nicely (correctly ?) e.g. minimum order filter uses 0 for out of bounds value -> append and prepend with last resp. 
first value - enhance for nd arrays, with axis = 0 diff --git a/statsmodels/sandbox/tsa/notes_organize.txt b/statsmodels/sandbox/tsa/notes_organize.txt index a80820d2e4a..40cf7312b08 100644 --- a/statsmodels/sandbox/tsa/notes_organize.txt +++ b/statsmodels/sandbox/tsa/notes_organize.txt @@ -110,7 +110,7 @@ scikits.statsmodels.sandbox.regression.mle ------------------------------------------ one refactoring bug fixed, because arima.ARIMA needs data, use class method instead -runs without exception, but I didn't look at any results +runs without exception, but I did not look at any results "main" has quite a lot AR : Notes @@ -145,7 +145,7 @@ script files sandbox/tsa/try_arma_more.py ---------------------------- -imports scikits.talkbox which is not compiled against my current numpy and doesn't +imports scikits.talkbox which is not compiled against my current numpy and does not run contains arma_periodogram : theoretical periodogram @@ -209,7 +209,7 @@ not clear parameterization A(L)y_t = A(L)x_t + B(L)e_t - ARMAX A(L)(y_t - A^{-1}(L) C(L) x_t) = B(L)e_t - this doesn't look useful, unless we cutoff A^{-1}(L) + this does not look useful, unless we cutoff A^{-1}(L) problem: signal.lfilter can only handle ARMAX residuals model (I think) deterministic trend have ARMAX-simple model, e.g. 
in unit root tests diff --git a/statsmodels/sandbox/tsa/try_var_convolve.py b/statsmodels/sandbox/tsa/try_var_convolve.py index 87f6f2351af..2012ccc764d 100644 --- a/statsmodels/sandbox/tsa/try_var_convolve.py +++ b/statsmodels/sandbox/tsa/try_var_convolve.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """trying out VAR filtering and multidimensional fft -Note: second half is copy and paste and doesn't run as script +Note: second half is copy and paste and does not run as script incomplete definitions of variables, some I created in shell Created on Thu Jan 07 12:23:40 2010 @@ -9,7 +9,7 @@ Author: josef-pktd update 2010-10-22 -2 arrays were not defined, copied from fft_filter.log.py but I didn't check +2 arrays were not defined, copied from fft_filter.log.py but I did not check what the results are. Runs now without raising exception """ diff --git a/statsmodels/sandbox/tsa/varma.py b/statsmodels/sandbox/tsa/varma.py index 13718044c69..1f5344bed65 100644 --- a/statsmodels/sandbox/tsa/varma.py +++ b/statsmodels/sandbox/tsa/varma.py @@ -1,6 +1,6 @@ '''VAR and VARMA process -this doesn't actually do much, trying out a version for a time loop +this does not actually do much, trying out a version for a time loop alternative representation: * textbook, different blocks in matrices diff --git a/statsmodels/stats/_diagnostic_other.py b/statsmodels/stats/_diagnostic_other.py index 33f803d3bac..bdc4e21929f 100644 --- a/statsmodels/stats/_diagnostic_other.py +++ b/statsmodels/stats/_diagnostic_other.py @@ -540,7 +540,7 @@ def lm_robust_subset_parts(score, k_constraints, This is the same as lm_robust_subset with arguments in parts of partitioned matrices. This can be useful, when we have the parts based on different estimation - procedures, i.e. when we don't have the full unconstrained model. + procedures, i.e. when we do not have the full unconstrained model. Calculates mainly the covariance of the constraint part of the score. 
@@ -583,7 +583,7 @@ def lm_robust_subset_parts(score, k_constraints, instead of calculating the score/lm test. Implementation similar to lm_robust_subset and is based on Boos 1992, - section 4.1 in the form attributed to Breslow (1990). It doesn't use the + section 4.1 in the form attributed to Breslow (1990). It does not use the computation attributed to Kent (1982) and Engle (1984). """ @@ -801,7 +801,7 @@ def dispersion_poisson_generic(results, exog_new_test, exog_new_control=None, stat_ols = ht.statistic pval_ols = ht.pvalue else: - # we don't have controls and can use overall fit + # we do not have controls and can use overall fit nobs = endog_v.shape[0] rsquared_noncentered = 1 - res_ols.ssr/res_ols.uncentered_tss stat_ols = nobs * rsquared_noncentered @@ -963,7 +963,7 @@ class CMTNewey(object): Parameters ---------- moments : ndarray, 1-D - moments that are tested to be zero. They don't need to be derived + moments that are tested to be zero. They do not need to be derived from a likelihood function. moments_deriv : ndarray derivative of the moment function with respect to the parameters that @@ -1127,7 +1127,7 @@ class CMTTauchen(object): derivative of score function with respect to the parameters that are estimated. This is the Hessian in quasi-maximum likelihood moments : ndarray, 1-D - moments that are tested to be zero. They don't need to be derived + moments that are tested to be zero. They do not need to be derived from a likelihood function. moments_deriv : ndarray derivative of the moment function with respect to the parameters that diff --git a/statsmodels/stats/_knockoff.py b/statsmodels/stats/_knockoff.py index 832e123c0d8..b38c5fcafa3 100644 --- a/statsmodels/stats/_knockoff.py +++ b/statsmodels/stats/_knockoff.py @@ -42,7 +42,7 @@ class RegressionFDR(object): An instance of a RegressionEffects class that can compute effect sizes for the regression coefficients. 
method : string - The approach used to asssess and control FDR, currently + The approach used to assess and control FDR, currently must be 'knockoff'. Returns diff --git a/statsmodels/stats/_lilliefors.py b/statsmodels/stats/_lilliefors.py index d340b5a1f0a..7f3a44bc005 100644 --- a/statsmodels/stats/_lilliefors.py +++ b/statsmodels/stats/_lilliefors.py @@ -198,7 +198,7 @@ def pval_lf(d_max, n): Notes ----- This is mainly a helper function where the calling code should dispatch - on bound violations. Therefore it doesn't check whether the pvalue is in + on bound violations. Therefore it does not check whether the pvalue is in the valid range. Precision for the pvalues is around 2 to 3 decimals. This approximation is diff --git a/statsmodels/stats/anova.py b/statsmodels/stats/anova.py index 5ca66611f11..101e909153d 100644 --- a/statsmodels/stats/anova.py +++ b/statsmodels/stats/anova.py @@ -259,7 +259,7 @@ def anova3_lm_single(model, design_info, n_rows, test, pr_test, robust): index.append(term.name()) table.index = Index(index + ['Residual']) - #NOTE: Don't need to sort because terms are an ordered dict now + #NOTE: Do not need to sort because terms are an ordered dict now #table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1]+1])] # back out sum of squares from f_test ssr = table[test] * table['df'] * model.ssr/model.df_resid diff --git a/statsmodels/stats/base.py b/statsmodels/stats/base.py index 63de289cf4d..fde3986e720 100644 --- a/statsmodels/stats/base.py +++ b/statsmodels/stats/base.py @@ -78,7 +78,7 @@ def pval_table(self): ''' k = self.n_levels pvals_mat = np.zeros((k, k)) - # if we don't assume we have all pairs + # if we do not assume we have all pairs pvals_mat[lzip(*self.all_pairs)] = self.pval_corrected() return pvals_mat diff --git a/statsmodels/stats/contrast.py b/statsmodels/stats/contrast.py index ae02c106a2d..5bb8d59cb67 100644 --- a/statsmodels/stats/contrast.py +++ b/statsmodels/stats/contrast.py @@ -127,7 +127,7 @@ def 
summary(self, xname=None, alpha=0.05, title=None): if title is None: title = 'Test for Constraints' elif title == '': - # don't add any title, + # do not add any title, # I think SimpleTable skips on None - check title = None # we have everything for a params table @@ -355,7 +355,7 @@ def __init__(self, statistic, distribution, dist_args, table=None, #self.sd = sd self.dist_args = dist_args - # The following is because I don't know which we want + # The following is because I do not know which we want if table is not None: self.statistic = table['statistic'].values self.pvalues = table['pvalue'].values @@ -395,7 +395,7 @@ def summary_frame(self): # needs to be a method for consistency if hasattr(self, '_dframe'): return self._dframe - # rename the column nambes, but don't copy data + # rename the column names, but do not copy data renaming = dict(zip(self.table.columns, self.col_names)) self.dframe = self.table.rename(columns=renaming) return self.dframe diff --git a/statsmodels/stats/correlation_tools.py b/statsmodels/stats/correlation_tools.py index 9418bb83cc0..2ba708394ba 100644 --- a/statsmodels/stats/correlation_tools.py +++ b/statsmodels/stats/correlation_tools.py @@ -335,7 +335,7 @@ def _spg_optim(func, grad, start, project, maxiter=1e4, M=10, Notes ----- This can be an effective heuristic algorithm for problems where no - gauranteed algorithm for computing a global minimizer is known. + guaranteed algorithm for computing a global minimizer is known. There are a number of tuning parameters, but these generally should not be changed except for `maxiter` (positive integer) and @@ -587,7 +587,7 @@ def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30, population correlation matrix. The factor structure allows these tasks to be done without constructing any n x n matrices. - This is a non-convex problem with no known gauranteed globally + This is a non-convex problem with no known guaranteed globally convergent algorithm for computing the solution.
Borsdof, Higham and Raydan (2010) compared several methods for this problem and found the spectral projected gradient (SPG) method (used here) to diff --git a/statsmodels/stats/descriptivestats.py b/statsmodels/stats/descriptivestats.py index 8a10bf52e12..c138aac56a8 100644 --- a/statsmodels/stats/descriptivestats.py +++ b/statsmodels/stats/descriptivestats.py @@ -203,7 +203,7 @@ def summary(self, stats='basic', columns='all', orientation='auto'): stats = ('obs', 'mean', 'std', 'min', 'max') elif stats == 'all': #stats = self.univariate.keys() - #dict doesn't keep an order, use full list instead + #dict does not keep an order, use full list instead stats = ['obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode_val', 'mode_bin', 'median', 'uss', 'skew', 'kurtosis', 'percentiles'] @@ -234,7 +234,7 @@ def _fun(per): - #JP: this doesn't allow a change in sequence, sequence in stats is + #JP: this does not allow a change in sequence, sequence in stats is #ignored #this is just an if condition if any([aitem[1] for aitem in iteritems(self.univariate) if aitem[0] in diff --git a/statsmodels/stats/inter_rater.py b/statsmodels/stats/inter_rater.py index f7f19aa04c8..cc84290ef82 100644 --- a/statsmodels/stats/inter_rater.py +++ b/statsmodels/stats/inter_rater.py @@ -327,7 +327,7 @@ def cohens_kappa(table, weights=None, return_results=True, wt=None): weights = '0, 0, 1, 1' and wt = 'linear' means that the first two levels are zero distance apart and the same for the last two levels. This is - the sampe as forming two aggregated levels by merging the first two and + the same as forming two aggregated levels by merging the first two and the last two levels, respectively.
weights = [0, 1, 2, 3] and wt = 'quadratic' is the same as squaring these diff --git a/statsmodels/stats/libqsturng/CH.r b/statsmodels/stats/libqsturng/CH.r index 38063ea5510..60d9947523b 100644 --- a/statsmodels/stats/libqsturng/CH.r +++ b/statsmodels/stats/libqsturng/CH.r @@ -3,7 +3,7 @@ % This is a collection of scripts used to generate C-H comparisons -% for qsturng. As you can probably guess, my R's skills aren't all that good. +% for qsturng. As you can probably guess, my R skills are not all that good. setwd('D:\\USERS\\roger\\programming\\python\\development\\qsturng') diff --git a/statsmodels/stats/libqsturng/make_tbls.py b/statsmodels/stats/libqsturng/make_tbls.py index 534a9e7faf9..8dc01f96b21 100644 --- a/statsmodels/stats/libqsturng/make_tbls.py +++ b/statsmodels/stats/libqsturng/make_tbls.py @@ -461,7 +461,7 @@ def qhat(a, p, r, v): else: A[(p,v)] = list(a1) -raise Exception("we don't want to import this") +raise Exception("we do not want to import this") # uncomment the lines below to repr-ize A ##import pprint ##pprint.pprint(A, width=160) diff --git a/statsmodels/stats/libqsturng/qsturng_.py b/statsmodels/stats/libqsturng/qsturng_.py index 6dc40485d96..932148c1dea 100644 --- a/statsmodels/stats/libqsturng/qsturng_.py +++ b/statsmodels/stats/libqsturng/qsturng_.py @@ -48,7 +48,7 @@ # r values for combinations of p and v. In total there are 206 # estimates over p-values of .5, .75, .9, .95, .975, .99, .995, # and .999, and over v (degrees of freedom) of (1) - 20, 24, 30, 40, -# 60, 120, and inf. combinations with p < .95 don't have coefficients +# 60, 120, and inf. combinations with p < .95 do not have coefficients # for v = 1. Hence the parentheses. These coefficients allow us to # form f-hat.
f-hat with the inverse t transform of tinv(p,v) yields # a fairly accurate estimate of the studentized range distribution @@ -391,7 +391,7 @@ def _isfloat(x): def _phi( p ): # this function is faster than using scipy.stats.norm.isf(p) - # but the permissity of the license isn't explicitly listed. + # but the permissiveness of the license is not explicitly listed. # using scipy.stats.norm.isf(p) is an acceptable alternative """ diff --git a/statsmodels/stats/multitest.py b/statsmodels/stats/multitest.py index a234ffd05e6..17062b9af02 100644 --- a/statsmodels/stats/multitest.py +++ b/statsmodels/stats/multitest.py @@ -384,7 +384,7 @@ def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False, linear step-up procedure (fdrcorrection0 with method='indep') corrected for the estimated fraction of true hypotheses. This means that the rejection decision can be obtained with - ``pval_corrected <= alpha``, where ``alpha`` is the origianal significance + ``pval_corrected <= alpha``, where ``alpha`` is the original significance level. (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)
@@ -431,7 +431,7 @@ def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False, break elif ri < ri_old: # prevent cycles and endless loops - raise RuntimeError(" oops - shouldn't be here") + raise RuntimeError(" oops - should not be here") ri_old = ri # make adjustment to pvalscorr to reflect estimated number of Non-Null cases diff --git a/statsmodels/stats/multivariate_tools.py b/statsmodels/stats/multivariate_tools.py index e2452e5cfaa..12970171147 100644 --- a/statsmodels/stats/multivariate_tools.py +++ b/statsmodels/stats/multivariate_tools.py @@ -100,7 +100,7 @@ def cancorr(x1, x2, demean=True, standardize=False): x2 = (x2 - x2.mean(0)) if standardize: - #std doesn't make a difference to canonical correlation coefficients + #std does not make a difference to canonical correlation coefficients x1 /= x1.std(0) x2 /= x2.std(0) diff --git a/statsmodels/stats/outliers_influence.py b/statsmodels/stats/outliers_influence.py index 70552b1f398..8979bb60b25 100644 --- a/statsmodels/stats/outliers_influence.py +++ b/statsmodels/stats/outliers_influence.py @@ -178,7 +178,7 @@ def variance_inflation_factor(exog, exog_idx): See Also -------- - xxx : class for regression diagnostics TODO: doesn't exist yet + xxx : class for regression diagnostics TODO: does not exist yet References ---------- @@ -543,7 +543,7 @@ def summary_frame(self): hat_diag=self.hat_matrix_diag, dffits_internal=self.d_fittedvalues_scaled), index=row_labels) - # NOTE: if we don't give columns, order of above will be arbitrary + # NOTE: if we do not give columns, order of above will be arbitrary dfbeta = DataFrame(self.dfbetas, columns=beta_labels, index=row_labels) @@ -570,7 +570,7 @@ class OLSInfluence(_BaseInfluenceMixin): is not too large. One possible approach for LOOO measures would be to identify possible problem observations with the _internal measures, and then run the leave-one-observation-out only with observations that are - possible outliers. 
(However, this is not yet available in an automized way.) + possible outliers. (However, this is not yet available in an automated way.) This should be extended to general least squares. @@ -829,7 +829,7 @@ def cov_ratio(self): requires leave one out loop for observations """ - # don't use inplace division / because then we change original + # do not use inplace division / because then we change original cov_ratio = (self.det_cov_params_not_obsi / np.linalg.det(self.results.cov_params())) return cov_ratio @@ -1016,7 +1016,7 @@ def summary_frame(self): dffits=self.dffits[0], ), index=row_labels) - # NOTE: if we don't give columns, order of above will be arbitrary + # NOTE: if we do not give columns, order of above will be arbitrary dfbeta = DataFrame(self.dfbetas, columns=beta_labels, index=row_labels) diff --git a/statsmodels/stats/power.py b/statsmodels/stats/power.py index b650595d88a..9dba38a84a4 100644 --- a/statsmodels/stats/power.py +++ b/statsmodels/stats/power.py @@ -43,7 +43,7 @@ def ttest_power(effect_size, nobs, alpha, df=None, alternative='two-sided'): df = nobs - 1 if alternative in ['two-sided', '2s']: - alpha_ = alpha / 2. #no inplace changes, doesn't work + alpha_ = alpha / 2. #no inplace changes, does not work elif alternative in ['smaller', 'larger']: alpha_ = alpha else: @@ -76,7 +76,7 @@ def normal_power(effect_size, nobs, alpha, alternative='two-sided', sigma=1.): d = effect_size if alternative in ['two-sided', '2s']: - alpha_ = alpha / 2. #no inplace changes, doesn't work + alpha_ = alpha / 2. 
#no inplace changes, does not work elif alternative in ['smaller', 'larger']: alpha_ = alpha else: diff --git a/statsmodels/stats/proportion.py b/statsmodels/stats/proportion.py index 9b3b6f9f8db..e6bb7ed28ae 100644 --- a/statsmodels/stats/proportion.py +++ b/statsmodels/stats/proportion.py @@ -643,7 +643,7 @@ def power_ztost_prop(low, upp, nobs, p_alt, alpha=0.05, dist='norm', alpha : float in (0,1) significance level of the test dist : string in ['norm', 'binom'] - This defines the distribution to evalute the power of the test. The + This defines the distribution to evaluate the power of the test. The critical values of the TOST test are always based on the normal approximation, but the distribution for the power can be either the normal (default) or the binomial (exact) distribution. diff --git a/statsmodels/stats/sandwich_covariance.py b/statsmodels/stats/sandwich_covariance.py index 8c5948031f6..7d8801eefb5 100644 --- a/statsmodels/stats/sandwich_covariance.py +++ b/statsmodels/stats/sandwich_covariance.py @@ -64,7 +64,7 @@ quasi-MLE: MLE with mis-specified model where parameter estimates are fine (consistent ?) but cov_params needs to be adjusted similar or -same as in sandwiches. (I didn't go through any details yet.) +same as in sandwiches. (I did not go through any details yet.) 
TODO ---- @@ -246,7 +246,7 @@ def _get_sandwich_arrays(results, cov_type=''): # experimental support for freq_weights if hasattr(results.model, 'freq_weights') and not cov_type == 'clu': - # we don't want to square the weights in the covariance calculations + # we do not want to square the weights in the covariance calculations # assumes that freq_weights are incorporated in score_obs or equivalent # assumes xu/score_obs is 2D # temporary asarray diff --git a/statsmodels/stats/stattools.py b/statsmodels/stats/stattools.py index db9271dad24..b67f906444e 100644 --- a/statsmodels/stats/stattools.py +++ b/statsmodels/stats/stattools.py @@ -3,7 +3,7 @@ Notes ----- -These functions haven't been formally tested. +These functions have not been formally tested. """ from scipy import stats @@ -312,7 +312,7 @@ def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True): kr2 : ndarray Kurtosis estimator based on octiles. kr3 : ndarray - Kurtosis estimators based on exceedence expectations. + Kurtosis estimators based on exceedance expectations. kr4 : ndarray Kurtosis measure based on the spread between high and low quantiles. 
diff --git a/statsmodels/stats/tests/test_anova.py b/statsmodels/stats/tests/test_anova.py index 9ed5eac6d28..6803a88facd 100644 --- a/statsmodels/stats/tests/test_anova.py +++ b/statsmodels/stats/tests/test_anova.py @@ -77,7 +77,7 @@ class TestAnovaLM(object): @classmethod def setup_class(cls): # kidney data taken from JT's course - # don't know the license + # do not know the license cls.data = kidney_table cls.kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=cls.data).fit() @@ -100,7 +100,7 @@ class TestAnovaLMNoconstant(object): @classmethod def setup_class(cls): # kidney data taken from JT's course - # don't know the license + # do not know the license cls.data = kidney_table cls.kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight) - 1', data=cls.data).fit() @@ -270,7 +270,7 @@ def test_results(self): class TestAnova2HC0(TestAnovaLM): - #NOTE: R doesn't return SSq with robust covariance. Why? + #NOTE: R does not return SSq with robust covariance. Why? # drop some observations to make an unbalanced, disproportionate panel # to make sure things are okay def test_results(self): @@ -407,7 +407,7 @@ def test_results(self): np.testing.assert_almost_equal(results['PR(>F)'].values, PrF) class TestAnova3HC0(TestAnovaLM): - #NOTE: R doesn't return SSq with robust covariance. Why? + #NOTE: R does not return SSq with robust covariance. Why? # drop some observations to make an unbalanced, disproportionate panel # to make sure things are okay def test_results(self): diff --git a/statsmodels/stats/tests/test_contrast.py b/statsmodels/stats/tests/test_contrast.py index 4358dd93654..6b16753f1ad 100644 --- a/statsmodels/stats/tests/test_contrast.py +++ b/statsmodels/stats/tests/test_contrast.py @@ -36,7 +36,7 @@ def test_contrast3(self): def test_estimable(self): X2 = np.column_stack((self.X, self.X[:,5])) c = Contrast(self.X[:,5],X2) - #TODO: I don't think this should be estimable? isestimable correct? + #TODO: I do not think this should be estimable? 
isestimable correct? def test_constraints(): diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index 9d10d728f08..f37b66d5cca 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -523,7 +523,7 @@ def test_breaks_hansen(self): bh = smsdia.breaks_hansen(self.res) assert_almost_equal(bh[0], breaks_nyblom_hansen['statistic'], decimal=13) - #TODO: breaks_hansen doesn't return pvalues + #TODO: breaks_hansen does not return pvalues def test_recursive_residuals(self): @@ -791,7 +791,7 @@ def test_influence_wrapped(): gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna() lint = d['realint'][:-1] - # re-index these because they won't conform to lint + # re-index these because they will not conform to lint gs_l_realgdp.index = lint.index gs_l_realinv.index = lint.index diff --git a/statsmodels/stats/tests/test_diagnostic_other.py b/statsmodels/stats/tests/test_diagnostic_other.py index 3baee55279d..31eec07aede 100644 --- a/statsmodels/stats/tests/test_diagnostic_other.py +++ b/statsmodels/stats/tests/test_diagnostic_other.py @@ -85,7 +85,7 @@ def attach_moment_conditions(self): # weights used for GMM to replicate OLS weights = np.linalg.inv(cov_moms) - # we don't use last two variables + # we do not use last two variables weights[:, -k_constraints:] = 0 weights[-k_constraints:, :] = 0 diff --git a/statsmodels/stats/tests/test_influence.py b/statsmodels/stats/tests/test_influence.py index 81d2e2d44d5..79fc719daa6 100644 --- a/statsmodels/stats/tests/test_influence.py +++ b/statsmodels/stats/tests/test_influence.py @@ -162,10 +162,10 @@ def test_r(self): # > imI <- influence.measures(glmI) # > t(imI$infmat) - # dfbeta/dfbetas and dffits don't make sense to me and are furthe away from + # dfbeta/dfbetas and dffits do not make sense to me and are further away from # looo than mine # resid seem to be resid_deviance based and not resid_pearson - # I didn't compare cov.r + #
I did not compare cov.r infl1 = self.infl1 cooks_d = [0.25220202795934726, 0.26107981497746285, 1.28985614424132389, 0.08449722285516942, 0.36362110845918005] diff --git a/statsmodels/stats/tests/test_pairwise.py b/statsmodels/stats/tests/test_pairwise.py index 4be52ef860b..1b6ba6b39cb 100644 --- a/statsmodels/stats/tests/test_pairwise.py +++ b/statsmodels/stats/tests/test_pairwise.py @@ -264,13 +264,13 @@ def test_incorrect_output(self): # just one group assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1] * 10) - # group_order doesn't select all observations, only one group left + # group_order does not select all observations, only one group left with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 5, group_order=[1]) - # group_order doesn't select all observations, + # group_order does not select all observations, # we do tukey_hsd with reduced set of observations data = np.arange(15) groups = np.repeat([1, 2, 3], 5) diff --git a/statsmodels/stats/tests/test_power.py b/statsmodels/stats/tests/test_power.py index 93eee9bdd36..af880e0aa48 100644 --- a/statsmodels/stats/tests/test_power.py +++ b/statsmodels/stats/tests/test_power.py @@ -489,7 +489,7 @@ def setup_class(cls): cls.res2 = res2 cls.kwds = {'effect_size': res2.d, 'nobs1': res2.n, 'alpha': res2.sig_level, 'power':res2.power} - # keyword for which we don't look for root: + # keyword for which we do not look for root: cls.kwds_extra = {'ratio': 0} cls.cls = smp.NormalIndPower @@ -513,7 +513,7 @@ def setup_class(cls): cls.res2 = res2 cls.kwds = {'effect_size': res2.d, 'nobs1': res2.n, 'alpha': res2.sig_level, 'power':res2.power} - # keyword for which we don't look for root: + # keyword for which we do not look for root: cls.kwds_extra = {'ratio': 0, 'alternative':'smaller'} cls.cls = smp.NormalIndPower @@ -537,8 +537,8 @@ def setup_class(cls): cls.res2 = res2 cls.kwds = {'effect_size': res2.w, 
'nobs': res2.N, 'alpha': res2.sig_level, 'power':res2.power} - # keyword for which we don't look for root: - # solving for n_bins doesn't work, will not be used in regular usage + # keyword for which we do not look for root: + # solving for n_bins does not work, will not be used in regular usage cls.kwds_extra = {'n_bins': res2.df + 1} cls.cls = smp.GofChisquarePower @@ -652,9 +652,9 @@ def setup_class(cls): cls.res2 = res2 cls.kwds = {'effect_size': res2.f, 'nobs': res2.n, 'alpha': res2.alpha, 'power': res2.power} - # keyword for which we don't look for root: - # solving for n_bins doesn't work, will not be used in regular usage - cls.kwds_extra = {'k_groups': res2.k} # rootfinding doesn't work + # keyword for which we do not look for root: + # solving for n_bins does not work, will not be used in regular usage + cls.kwds_extra = {'k_groups': res2.k} # rootfinding does not work #cls.args_names = ['effect_size','nobs', 'alpha']#, 'k_groups'] cls.cls = smp.FTestAnovaPower # precision for test_power @@ -680,8 +680,8 @@ def setup_class(cls): cls.kwds = {'effect_size': np.sqrt(res2.f2), 'df_num': res2.v, 'df_denom': res2.u, 'alpha': res2.sig_level, 'power': res2.power} - # keyword for which we don't look for root: - # solving for n_bins doesn't work, will not be used in regular usage + # keyword for which we do not look for root: + # solving for n_bins does not work, will not be used in regular usage cls.kwds_extra = {} cls.args_names = ['effect_size', 'df_num', 'df_denom', 'alpha'] cls.cls = smp.FTestPower @@ -727,7 +727,7 @@ def test_power_solver(): assert_almost_equal(es, 0.01) # I let this case fail, could be fixed for some statistical tests - # (we shouldn't get here in the first place) + # (we should not get here in the first place) # effect size is negative, but last stage brentq uses [1e-8, 1-1e-8] assert_raises(ValueError, nip.solve_power, None, nobs1=1600, alpha=0.01, power=0.005, ratio=1, alternative='larger') diff --git 
a/statsmodels/stats/tests/test_proportion.py b/statsmodels/stats/tests/test_proportion.py index 116329f21d4..4813279e848 100644 --- a/statsmodels/stats/tests/test_proportion.py +++ b/statsmodels/stats/tests/test_proportion.py @@ -440,7 +440,7 @@ def test_power_binom_tost(): power = smprop.power_binom_tost(0.4, 0.6, nobs, p_alt=0.5, alpha=0.05) res_power = np.array([ 0., 0., 0., 0.0889, 0.2356, 0.3517, 0.4457, 0.6154, 0.6674, 0.7708]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) def test_power_ztost_prop(): @@ -456,7 +456,7 @@ def test_power_ztost_prop(): res_power = np.array([ 0., 0., 0., 0.0889, 0.2356, 0.4770, 0.5530, 0.6154, 0.7365, 0.7708]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) # with critval_continuity correction @@ -467,7 +467,7 @@ def test_power_ztost_prop(): res_power = np.array([0., 0., 0., 0.0889, 0.2356, 0.3517, 0.4457, 0.6154, 0.6674, 0.7708]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) power = smprop.power_ztost_prop(0.4, 0.6, np.arange(20, 210, 20), @@ -477,7 +477,7 @@ def test_power_ztost_prop(): res_power = np.array([0., 0., 0., 0.0889, 0.2356, 0.3517, 0.4457, 0.6154, 0.6674, 0.7112]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) def test_ztost(): @@ -508,7 +508,7 @@ def test_power_ztost_prop_norm(): res_power = np.array([0., 0., 0., 0.11450013, 0.27752006, 0.41495922, 0.52944621, 0.62382638, 0.70092914, 
0.76341806]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) # regression test for normal distribution @@ -521,7 +521,7 @@ def test_power_ztost_prop_norm(): res_power = np.array([0., 0., 0.02667562, 0.20189793, 0.35099606, 0.47608598, 0.57981118, 0.66496683, 0.73427591, 0.79026127]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) # regression test for normal distribution @@ -533,7 +533,7 @@ def test_power_ztost_prop_norm(): res_power = np.array([0., 0., 0., 0.08902071, 0.23582284, 0.35192313, 0.55312718, 0.61549537, 0.66743625, 0.77066806]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) # regression test for normal distribution @@ -545,7 +545,7 @@ def test_power_ztost_prop_norm(): res_power = np.array([0., 0., 0., 0.08902071, 0.23582284, 0.35192313, 0.44588687, 0.61549537, 0.66743625, 0.71115563]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) # regression test for normal distribution @@ -557,7 +557,7 @@ def test_power_ztost_prop_norm(): res_power = np.array([0., 0., 0., 0., 0.15851942, 0.41611758, 0.5010377, 0.5708047, 0.70328247, 0.74210096]) - # TODO: I currently don't impose power>=0, i.e np.maximum(power, 0) + # TODO: I currently do not impose power>=0, i.e np.maximum(power, 0) assert_almost_equal(np.maximum(power, 0), res_power, decimal=4) diff --git a/statsmodels/stats/tests/test_weightstats.py b/statsmodels/stats/tests/test_weightstats.py 
index 257b8e37422..093205005f3 100644 --- a/statsmodels/stats/tests/test_weightstats.py +++ b/statsmodels/stats/tests/test_weightstats.py @@ -4,7 +4,7 @@ update 2012-09-09: added test after fixing bug in covariance TODOs: - - I don't remember what all the commented out code is doing + - I do not remember what all the commented out code is doing - should be refactored to use generator or inherited tests - still gaps in test coverage - value/diff in ttest_ind is tested in test_tost.py @@ -259,7 +259,7 @@ def test_weightstats_3(self): resss = stats.ttest_ind(x1r_2d, x2r_2d) assert_almost_equal(ressm[:2], resss, 14) -# doesn't work for 2d, levene doesn't use weights +# does not work for 2d, levene does not use weights # cm = CompareMeans(d1w_2d, d2w_2d) # ressm = cm.test_equal_var() # resss = stats.levene(x1r_2d, x2r_2d) diff --git a/statsmodels/stats/weightstats.py b/statsmodels/stats/weightstats.py index 0340e6b06f0..813e8712862 100644 --- a/statsmodels/stats/weightstats.py +++ b/statsmodels/stats/weightstats.py @@ -27,7 +27,7 @@ the user chooses. - fixed ddof for the meandiff ttest, now matches scipy.stats.ttest_ind -Note: scipy has now a separate, pooled variance option in ttest, but I haven't +Note: scipy now has a separate, pooled variance option in ttest, but I have not compared yet.
''' @@ -91,7 +91,7 @@ class DescrStatsW(object): array([ 1.58414212e-12, 3.87842808e-02, 6.02752170e-01]) 44.0 - #if weiqhts are integers, then asrepeats can be used + #if weights are integers, then asrepeats can be used >>> x1r = d1.asrepeats() >>> x1r.shape @@ -1105,7 +1105,7 @@ def ztost_ind(self, low, upp, usevar='pooled'): #tost.__doc__ = tost_ind.__doc__ -#doesn't work for 2d, doesn't take weights into account +#does not work for 2d, does not take weights into account ## def test_equal_var(self): ## '''Levene test for independence ## diff --git a/statsmodels/tools/_testing.py b/statsmodels/tools/_testing.py index 663a92c613a..b1bebf481f3 100644 --- a/statsmodels/tools/_testing.py +++ b/statsmodels/tools/_testing.py @@ -133,7 +133,7 @@ def check_ftest_pvalues(results): summ = str(res.summary()) assert_(string_use_t in summ) - # try except for models that don't have summary2 + # try except for models that do not have summary2 try: summ2 = str(res.summary2()) except AttributeError: diff --git a/statsmodels/tools/eval_measures.py b/statsmodels/tools/eval_measures.py index 01c6ff35017..b1f60ccc66a 100644 --- a/statsmodels/tools/eval_measures.py +++ b/statsmodels/tools/eval_measures.py @@ -405,7 +405,7 @@ def hqic(llf, nobs, df_modelwc): References ---------- - Wikipedia doesn't say much + Wikipedia does not say much """ return -2. * llf + 2 * np.log(np.log(nobs)) * df_modelwc diff --git a/statsmodels/tools/grouputils.py b/statsmodels/tools/grouputils.py index 87e017f3f7d..457dd281e34 100644 --- a/statsmodels/tools/grouputils.py +++ b/statsmodels/tools/grouputils.py @@ -272,7 +272,7 @@ def lag_indices(self, lag): individual, then no values for that individual are returned. TODO: for the unbalanced case, I should get the same truncation for - the array with lag=0. From the return of lag_idx we wouldn't know + the array with lag=0. From the return of lag_idx we would not know which individual is missing. TODO: do I want the full equivalent of lagmat in tsa? 
@@ -383,7 +383,7 @@ def reindex(self, index=None, names=None): """ Resets the index in-place. """ - # NOTE: this isn't of much use if the rest of the data doesn't change + # NOTE: this is not of much use if the rest of the data does not change # This needs to reset cache if names is None: names = self.group_names @@ -492,7 +492,7 @@ def transform_slices(self, array, function, level=0, **kwargs): processed = np.array(processed) return processed.reshape(-1, processed.shape[-1]) - # TODO: this isn't general needs to be a PanelGrouping object + # TODO: this is not general, needs to be a PanelGrouping object def dummies_time(self): self.dummy_sparse(level=1) return self._dummies diff --git a/statsmodels/tools/numdiff.py b/statsmodels/tools/numdiff.py index 199b2121f27..fee4775927c 100644 --- a/statsmodels/tools/numdiff.py +++ b/statsmodels/tools/numdiff.py @@ -8,7 +8,7 @@ These are simple forward differentiation, so that we have them available without dependencies. -* Jacobian should be faster than numdifftools because it doesn't use loop over +* Jacobian should be faster than numdifftools because it does not use a loop over observations.
* numerical precision will vary and depend on the choice of stepsizes """ diff --git a/statsmodels/tools/parallel.py b/statsmodels/tools/parallel.py index 0354832f09a..96ed385b7f6 100644 --- a/statsmodels/tools/parallel.py +++ b/statsmodels/tools/parallel.py @@ -6,7 +6,7 @@ License: Simplified BSD changes for statsmodels (Josef Perktold) -- try import from joblib directly, (doesn't import all of sklearn) +- try import from joblib directly, (does not import all of sklearn) """ diff --git a/statsmodels/tools/rootfinding.py b/statsmodels/tools/rootfinding.py index ad7a91faf2a..7a26f2dd8f2 100644 --- a/statsmodels/tools/rootfinding.py +++ b/statsmodels/tools/rootfinding.py @@ -124,7 +124,7 @@ def brentq_expanding(func, low=None, upp=None, args=(), xtol=1e-5, # special case for F-distribution (symmetric around zero for effect # size) - # chisquare also takes an indefinite time (didn't wait see if it + # chisquare also takes an indefinite time (did not wait to see if it # returns) if np.max(np.abs(f_upp - f_low)) < 1e-15 and sl == -1 and su == 1: sl = 1e-8 @@ -137,7 +137,7 @@ def brentq_expanding(func, low=None, upp=None, args=(), xtol=1e-5, delta = su - sl if np.isnan(f_low): # try just 3 points to find ``increasing`` - # don't change sl because brentq can handle one nan bound + # do not change sl because brentq can handle one nan bound for fraction in [0.25, 0.5, 0.75]: sl_ = sl + fraction * delta f_low = func(sl_, *args) diff --git a/statsmodels/tools/tests/test_numdiff.py b/statsmodels/tools/tests/test_numdiff.py index d3fd20ba695..1a69ce6f760 100644 --- a/statsmodels/tools/tests/test_numdiff.py +++ b/statsmodels/tools/tests/test_numdiff.py @@ -1,7 +1,7 @@ '''Testing numerical differentiation Still some problems, with API (args tuple versus *args) -finite difference Hessian has some problems that I didn't look at yet +finite difference Hessian has some problems that I did not look at yet Should Hessian also work per observation, if fun returns 2d @@ -113,7 +113,7 @@ def
test_hess(self): assert_almost_equal(he, hefd, decimal=DEC8) #NOTE: notice the accuracy below and the epsilon changes - # this doesn't work well for score -> hessian with non-cs step + # this does not work well for score -> hessian with non-cs step # it's a little better around the optimum assert_almost_equal(he, hefd, decimal=7) hefd = numdiff.approx_fprime(test_params, self.mod.score, @@ -128,7 +128,7 @@ def test_hess(self): hecs = numdiff.approx_hess_cs(test_params, self.mod.loglike) assert_almost_equal(he, hecs, decimal=5) - #NOTE: these just don't work well + #NOTE: these just do not work well #hecs = numdiff.approx_hess1(test_params, self.mod.loglike, 1e-3) #assert_almost_equal(he, hecs, decimal=1) #hecs = numdiff.approx_hess2(test_params, self.mod.loglike, 1e-4) @@ -211,7 +211,7 @@ def test_hess_fun1_fd(self): for test_params in self.params: #hetrue = 0 hetrue = self.hesstrue(test_params) - if hetrue is not None: #Hessian doesn't work for 2d return of fun + if hetrue is not None: #Hessian does not work for 2d return of fun fun = self.fun() #default works, epsilon 1e-6 or 1e-8 is not precise enough hefd = numdiff.approx_hess1(test_params, fun, #epsilon=1e-8, @@ -233,7 +233,7 @@ def test_hess_fun1_cs(self): for test_params in self.params: #hetrue = 0 hetrue = self.hesstrue(test_params) - if hetrue is not None: #Hessian doesn't work for 2d return of fun + if hetrue is not None: #Hessian does not work for 2d return of fun fun = self.fun() hecs = numdiff.approx_hess_cs(test_params, fun, args=self.args) assert_almost_equal(hetrue, hecs, decimal=DEC6) @@ -330,7 +330,7 @@ def f(x): print(numdiff.approx_hess(xk,fun2,1e-3, (y,x))[0] - 2*np.dot(x.T, x)) gt = (-x*2*(y-np.dot(x, [1,2,3]))[:,None]) - g = approx_fprime_cs((1,2,3), fun1, (y,x), h=1.0e-20)#.T #this shouldn't be transposed + g = approx_fprime_cs((1,2,3), fun1, (y,x), h=1.0e-20)#.T #this should not be transposed gd = numdiff.approx_fprime((1,2,3),fun1,epsilon,(y,x)) print(maxabs(g, gt)) print(maxabs(gd, gt)) 
@@ -345,7 +345,7 @@ def f(x): score = mod.score hess = mod.hessian - #cs doesn't work for Probit because special.ndtr doesn't support complex + #cs does not work for Probit because special.ndtr does not support complex #maybe calculating ndtr for real and imag parts separately, if we need it #and if it still works in this case print('sm', score(test_params)) diff --git a/statsmodels/tools/tests/test_rootfinding.py b/statsmodels/tools/tests/test_rootfinding.py index cceaac3fdd2..9d7eb475964 100644 --- a/statsmodels/tools/tests/test_rootfinding.py +++ b/statsmodels/tools/tests/test_rootfinding.py @@ -54,12 +54,12 @@ def test_brentq_expanding(): assert_allclose(res, a, rtol=1e-5) # wrong sign for start bounds - # doesn't raise yet during development TODO: activate this + # does not raise yet during development TODO: activate this # it kind of works in some cases, but not correctly or in a useful way #assert_raises(ValueError, brentq_expanding, func, args=(-500,), start_upp=-1000) #assert_raises(ValueError, brentq_expanding, func, args=(500,), start_low=1000) - # low upp given, but doesn't bound root, leave brentq exception + # low upp given, but does not bound root, leave brentq exception # ValueError: f(a) and f(b) must have different signs assert_raises(ValueError, brentq_expanding, funcn, args=(-50000,), low= -40000, upp=-10000) diff --git a/statsmodels/tools/tests/test_transform_model.py b/statsmodels/tools/tests/test_transform_model.py index b6ef06024bb..23c08ae421e 100644 --- a/statsmodels/tools/tests/test_transform_model.py +++ b/statsmodels/tools/tests/test_transform_model.py @@ -34,7 +34,7 @@ def test_standardize1(): assert_allclose(xs4, (2*x - transf.mean) / transf.scale, rtol=1e-13, atol=1e-20) - # affine transform doesn't change standardized + # affine transform does not change standardized x2 = 2 * x + np.random.randn(4) transf2 = StandardizeTransform(x2) xs3 = transf2(x2) diff --git a/statsmodels/tools/tools.py b/statsmodels/tools/tools.py index 
374fd0bdf33..4f0a04f8053 100644 --- a/statsmodels/tools/tools.py +++ b/statsmodels/tools/tools.py @@ -69,7 +69,7 @@ def drop_missing(Y, X=None, axis=1): # TODO: needs to better preserve dtype and be more flexible -# ie., if you still have a string variable in your array you don't +# ie., if you still have a string variable in your array you do not # want to cast it to float # TODO: add name validator (ie., bad names for datasets.grunfeld) def categorical(data, col=None, dictnames=False, drop=False): diff --git a/statsmodels/tsa/_stl.pyx b/statsmodels/tsa/_stl.pyx index 930fdc188ee..707af12a697 100644 --- a/statsmodels/tsa/_stl.pyx +++ b/statsmodels/tsa/_stl.pyx @@ -156,7 +156,7 @@ cdef class STL(object): Examples -------- - The original example uses STL to decompos CO2 data into level, season and a + The original example uses STL to decompose CO2 data into level, season and a residual. Start by aggregating to monthly, and filling any missing values diff --git a/statsmodels/tsa/ar_model.py b/statsmodels/tsa/ar_model.py index e0191cbc4a8..64f11dda747 100644 --- a/statsmodels/tsa/ar_model.py +++ b/statsmodels/tsa/ar_model.py @@ -36,7 +36,7 @@ def _ar_predict_out_of_sample(y, params, k_ar, k_trend, steps, start=0): arparams = params[k_trend:][::-1] # reverse for dot # dynamic endogenous variable - endog = np.zeros(k_ar + steps) # this is one too big but doesn't matter + endog = np.zeros(k_ar + steps) # this is one too big but does not matter if start: endog[:k_ar] = y[start-k_ar:start] else: @@ -135,7 +135,7 @@ def _get_prediction_index(self, start, end, dynamic, index=None): if start is None: if method == 'mle' and not dynamic: start = 0 - else: # can't do presample fit for cmle or dynamic + else: # cannot do presample fit for cmle or dynamic start = k_ar start = self._index[start] if end is None: @@ -581,12 +581,12 @@ def fit(self, maxlag=None, method='cmle', ic=None, trend='c', params = self._transparams(params) self.transparams = False # turn off now for other 
results - # don't use yw, because we can't estimate the constant + # do not use yw, because we cannot estimate the constant #elif method == "yw": # params, omega = yule_walker(endog, order=maxlag, # method="mle", demean=False) # # how to handle inference after Yule-Walker? - # self.params = params #TODO: don't attach here + # self.params = params #TODO: do not attach here # self.omega = omega pinv_exog = np.linalg.pinv(X) @@ -874,7 +874,7 @@ class ARResultsWrapper(wrap.ResultsWrapper): ts_dr.toordinal().astype(int))) sunspots = pandas.Series(sunspots.endog, index=dt_dates) - #NOTE: pandas can't handle pre-1900 dates + #NOTE: pandas cannot handle pre-1900 dates mod = AR(sunspots, freq='A') res = mod.fit(method='mle', maxlag=9) diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py index 879ae10adc9..634568fc5f8 100644 --- a/statsmodels/tsa/arima_model.py +++ b/statsmodels/tsa/arima_model.py @@ -284,7 +284,7 @@ def _get_predict_out_of_sample(endog, p, q, k_trend, k_exog, start, errors, mu = np.array([mu]*steps) elif k_exog > 0: X = np.dot(exog, exparams) - #NOTE: you shouldn't have to give in-sample exog! + #NOTE: you should not have to give in-sample exog! X = lagmat(X, p, original='in', trim='both') mu = (np.r_[1, -arparams[::-1]] * X).sum(1)[:, None] else: @@ -480,7 +480,7 @@ def _fit_start_params_hr(self, order, start_ar_lags=None): endog -= np.dot(exog, ols_params).squeeze() if q != 0: if p != 0: - # make sure we don't run into small data problems in AR fit + # make sure we do not run into small data problems in AR fit nobs = len(endog) if start_ar_lags is None: maxlag = int(round(12*(nobs/100.)**(1/4.))) @@ -494,7 +494,7 @@ def _fit_start_params_hr(self, order, start_ar_lags=None): arcoefs_tmp = armod.params p_tmp = armod.k_ar # it's possible in small samples that optimal lag-order - # doesn't leave enough obs. No consistent way to fix. + # does not leave enough obs. No consistent way to fix. 
if p_tmp + q >= len(endog): raise ValueError("Proper starting parameters cannot" " be found for this order with this " @@ -699,7 +699,7 @@ def geterrors(self, params): return errors.squeeze() def predict(self, params, start=None, end=None, exog=None, dynamic=False): - method = getattr(self, 'method', 'mle') # don't assume fit + method = getattr(self, 'method', 'mle') # do not assume fit #params = np.asarray(params) # will return an index of a date @@ -949,7 +949,7 @@ def fit(self, start_params=None, trend='c', method="css-mle", if transparams: # transform parameters back params = self._transparams(params) - self.transparams = False # so methods don't expect transf. + self.transparams = False # so methods do not expect transf. normalized_cov_params = None # TODO: fix this armafit = ARMAResults(self, params, normalized_cov_params) @@ -1431,7 +1431,7 @@ def llf(self): def bse(self): params = self.params hess = self.model.hessian(params) - if len(params) == 1: # can't take an inverse, ensure 1d + if len(params) == 1: # cannot take an inverse, ensure 1d return np.sqrt(-1./hess[0]) return np.sqrt(np.diag(-inv(hess))) diff --git a/statsmodels/tsa/arma_mle.py b/statsmodels/tsa/arma_mle.py index 86fd8622fa6..d9d22b67578 100644 --- a/statsmodels/tsa/arma_mle.py +++ b/statsmodels/tsa/arma_mle.py @@ -92,7 +92,7 @@ def loglike(self, params): sigma2 = np.maximum(params[-1]**2, 1e-6) axis = 0 nobs = len(errorsest) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 # llike = -0.5 * (np.sum(np.log(sigma2),axis) # + np.sum((errorsest**2)/sigma2, axis) @@ -122,7 +122,7 @@ def nloglikeobs(self, params): sigma2 = np.maximum(params[-1]**2, 1e-6) axis = 0 nobs = len(errorsest) - #this doesn't help for exploding paths + #this does not help for exploding paths #errorsest[np.isnan(errorsest)] = 100 # llike = -0.5 * (np.sum(np.log(sigma2),axis) # + np.sum((errorsest**2)/sigma2, axis) @@ -223,7 +223,7 @@ def fit(self, 
order=(0,0), start_params=None, method="ls", **optkwds): elif method == "ssm": pass else: #this is also conditional least squares - # fmin_bfgs is slow or doesn't work yet + # fmin_bfgs is slow or does not work yet errfnsum = lambda rho : np.sum(self.geterrors(rho)**2) #xopt, {fopt, gopt, Hopt, func_calls, grad_calls optim_kwds = dict(maxiter=2, full_output=True) @@ -270,7 +270,7 @@ def fit_mle(self, order=(0,0), start_params=None, method='nm', maxiter=5000, tol start_params = np.concatenate((0.05*np.ones(nar + nma), [1])) mlefit = super(Arma, self).fit(start_params=start_params, maxiter=maxiter, method=method, tol=tol, **kwds) - #bug fix: running ls and then mle didn't overwrite this + #bug fix: running ls and then mle did not overwrite this rh = mlefit.params self.params = rh self.ar_est = np.concatenate(([1], -rh[:p])) diff --git a/statsmodels/tsa/base/tests/test_base.py b/statsmodels/tsa/base/tests/test_base.py index 30236486bfd..bcbcd21307e 100644 --- a/statsmodels/tsa/base/tests/test_base.py +++ b/statsmodels/tsa/base/tests/test_base.py @@ -19,7 +19,7 @@ def test_pandas_nodates_index(): # TODO: Remove this, this is now valid # npt.assert_raises(ValueError, TimeSeriesModel, s) - # Test with a non-date index that doesn't raise an exception because it + # Test with a non-date index that does not raise an exception because it # can be coerced into a nanosecond DatetimeIndex data = [988, 819, 964] # index=pd.date_range('1970-01-01', periods=3, freq='QS') diff --git a/statsmodels/tsa/base/tests/test_tsa_indexes.py b/statsmodels/tsa/base/tests/test_tsa_indexes.py index 8811d05e0d4..94e85d52d87 100644 --- a/statsmodels/tsa/base/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/base/tests/test_tsa_indexes.py @@ -141,7 +141,7 @@ def test_instantiation_valid(): # 2. Int64Index with values exactly equal to 0, 1, ..., nobs-1 # 3. DatetimeIndex with frequency # 4. PeriodIndex with frequency - # 5. Anything that doesn't fall into the above categories also should + # 5. 
Anything that does not fall into the above categories also should # only raise an exception if it was passed to dates, and may trigger # a warning otherwise. # @@ -156,13 +156,13 @@ def test_instantiation_valid(): # 8. Series of date strings (requires freq) # 9. Series of datetime objects (requires freq) # 10. Series of pandas timestamps (requires freq) - # 11. Anything that doesn't fall into the above categories should raise + # 11. Anything that does not fall into the above categories should raise # an exception. # # `freq` can be: # 0. None # 1. Something that can be passed to `pd.to_offset` - # 2. Anything that can't should raise an Exception + # 2. Anything that cannot should raise an Exception # # Each test will be denoted by: # endog.index:exog.index/date/freq where the corresponding @@ -297,7 +297,7 @@ def test_instantiation_valid(): assert_equal(mod.data.freq, freq) # Increment index (this is a "supported" index in the sense that it - # doesn't raise a warning, but obviously not a date index) + # does not raise a warning, but obviously not a date index) endog = base_endog.copy() endog.index = supported_increment_indexes[0][0] @@ -393,13 +393,13 @@ def test_instantiation_valid(): assert_equal(mod.data.dates.equals(mod._index), True) # Note: here, we need to hedge the test a little bit because - # inferred frequencies aren't always the same as the original + # inferred frequencies are not always the same as the original # frequency. From the examples above, when the actual freq is # 2QS-OCT, the inferred freq is 2QS-JAN. This is an issue with # inferred frequencies, but since we are warning the user, it's # not a failure of the code. Thus we only test the "major" part # of the freq, and just test that the right message is given - # (even though it won't have the actual freq of the data in + # (even though it will not have the actual freq of the data in # it). 
assert_equal(mod.data.freq.split('-')[0], freq.split('-')[0]) assert_equal(str(w[-1].message), message % mod.data.freq) diff --git a/statsmodels/tsa/base/tsa_model.py b/statsmodels/tsa/base/tsa_model.py index 224a245ac47..67b8b048865 100644 --- a/statsmodels/tsa/base/tsa_model.py +++ b/statsmodels/tsa/base/tsa_model.py @@ -98,12 +98,12 @@ def _init_dates(self, dates=None, freq=None): else: index = self.data.row_labels - # Sanity check that we don't have a `freq` without an index + # Sanity check that we do not have a `freq` without an index if index is None and freq is not None: raise ValueError('Frequency provided without associated index.') # If an index is available, see if it is a date-based index or if it - # can be coerced to one. (If it can't we'll fall back, below, to an + # can be coerced to one. (If it cannot we'll fall back, below, to an # internal, 0, 1, ... nobs-1 integer index for modeling purposes) inferred_freq = False if index is not None: @@ -116,7 +116,7 @@ def _init_dates(self, dates=None, freq=None): # object dtype array in earlier versions of Pandas (and so # will not have is_numeric_dtype == True), so explicitly # check for it here. But note also that in very early - # Pandas (~0.12), Float64Index doesn't exist (and so the + # Pandas (~0.12), Float64Index does not exist (and so the # Statsmodels compat makes it an empty tuple, so in that # case also check if the first element is a float. _index = np.asarray(index) @@ -142,7 +142,7 @@ def _init_dates(self, dates=None, freq=None): index = _index except: # Only want to actually raise an exception if `dates` was - # provided but can't be coerced. If we got the index from + # provided but cannot be coerced. 
If we got the index from # the row_labels, we'll just ignore it and use the integer # index below if dates is not None: @@ -257,9 +257,9 @@ def _get_index_loc(self, key, base_index=None): The location of the key index : pd.Index The index including the key; this is a copy of the original index - unless the index had to be expanded to accomodate `key`. + unless the index had to be expanded to accommodate `key`. index_was_expanded : bool - Whether or not the index was expanded to accomodate `key`. + Whether or not the index was expanded to accommodate `key`. Notes ----- @@ -404,9 +404,9 @@ def _get_index_label_loc(self, key, base_index=None): The location of the key index : pd.Index The index including the key; this is a copy of the original index - unless the index had to be expanded to accomodate `key`. + unless the index had to be expanded to accommodate `key`. index_was_expanded : bool - Whether or not the index was expanded to accomodate `key`. + Whether or not the index was expanded to accommodate `key`. 
Notes ----- diff --git a/statsmodels/tsa/coint_tables.py b/statsmodels/tsa/coint_tables.py index ce20b7b49e3..0bf084b6a43 100644 --- a/statsmodels/tsa/coint_tables.py +++ b/statsmodels/tsa/coint_tables.py @@ -108,7 +108,7 @@ def c_sja(n, p): % ------------------------------------------------------------ % USAGE: jc = c_sjt(n,p) % where: n = dimension of the VAR system -% NOTE: routine doesn't work for n > 12 +% NOTE: routine does not work for n > 12 % p = order of time polynomial in the null-hypothesis % p = -1, no deterministic part % p = 0, for constant term diff --git a/statsmodels/tsa/descriptivestats.py b/statsmodels/tsa/descriptivestats.py index 7d550c8d7f9..fc1f563830b 100644 --- a/statsmodels/tsa/descriptivestats.py +++ b/statsmodels/tsa/descriptivestats.py @@ -46,7 +46,7 @@ def pacf(self, nlags=40): return stt.pacf(self.data, nlags=nlags) def periodogram(self): - #doesn't return frequesncies + #does not return frequencies return stt.periodogram(self.data) # copied from fftarma.py diff --git a/statsmodels/tsa/filters/filtertools.py b/statsmodels/tsa/filters/filtertools.py index 2d41fa7680b..188ea7657f0 100644 --- a/statsmodels/tsa/filters/filtertools.py +++ b/statsmodels/tsa/filters/filtertools.py @@ -44,7 +44,7 @@ def _pad_nans(x, head=None, tail=None): #original changes and examples in sandbox.tsa.try_var_convolve -# don't do these imports, here just for copied fftconvolve +# do not do these imports, here just for copied fftconvolve #get rid of these imports #from scipy.fftpack import fft, ifft, ifftshift, fft2, ifft2, fftn, \ # ifftn, fftfreq @@ -56,12 +56,12 @@ def fftconvolveinv(in1, in2, mode="full"): """Convolve two N-dimensional arrays using FFT. See convolve.
copied from scipy.signal.signaltools, but here used to try out inverse filter - doesn't work or I can't get it to work + does not work or I cannot get it to work 2010-10-23: looks ok to me for 1d, from results below with padded data array (fftp) - but it doesn't work for multidimensional inverse filter (fftn) + but it does not work for multidimensional inverse filter (fftn) original signal.fftconvolve also uses fftn """ @@ -77,7 +77,7 @@ def fftconvolveinv(in1, in2, mode="full"): #IN1 *= fftn(in2,fsize) #JP: this looks like the only change I made IN1 /= fft.fftn(in2,fsize) # use inverse filter # note the inverse is elementwise not matrix inverse - # is this correct, NO doesn't seem to work for VARMA + # is this correct, NO does not seem to work for VARMA fslice = tuple([slice(0, int(sz)) for sz in size]) ret = fft.ifftn(IN1)[fslice].copy() del IN1 @@ -106,12 +106,12 @@ def fftconvolve3(in1, in2=None, in3=None, mode="full"): since I'm using max of in2, in3 shapes and not the sum copied from scipy.signal.signaltools, but here used to try out inverse - filter doesn't work or I can't get it to work + filter does not work or I cannot get it to work 2010-10-23 looks ok to me for 1d, from results below with padded data array (fftp) - but it doesn't work for multidimensional inverse filter (fftn) + but it does not work for multidimensional inverse filter (fftn) original signal.fftconvolve also uses fftn """ if (in2 is None) and (in3 is None): @@ -138,7 +138,7 @@ def fftconvolve3(in1, in2=None, in3=None, mode="full"): if in3 is not None: IN1 /= fft.fftn(in3, fsize) # use inverse filter # note the inverse is elementwise not matrix inverse - # is this correct, NO doesn't seem to work for VARMA + # is this correct, NO does not seem to work for VARMA IN1 *= fft.fftn(in1, fsize) fslice = tuple([slice(0, int(sz)) for sz in size]) ret = fft.ifftn(IN1)[fslice].copy() diff --git a/statsmodels/tsa/filters/hp_filter.py b/statsmodels/tsa/filters/hp_filter.py index 
63fd993c841..48e5f7786b4 100644 --- a/statsmodels/tsa/filters/hp_filter.py +++ b/statsmodels/tsa/filters/hp_filter.py @@ -75,7 +75,7 @@ def hpfilter(x, lamb=1600): References ---------- Hodrick, R.J, and E. C. Prescott. 1980. "Postwar U.S. Business Cycles: An - Empricial Investigation." `Carnegie Mellon University discussion + Empirical Investigation." `Carnegie Mellon University discussion paper no. 451`. Ravn, M.O and H. Uhlig. 2002. "Notes On Adjusted the Hodrick-Prescott Filter for the Frequency of Observations." `The Review of Economics and diff --git a/statsmodels/tsa/holtwinters.py b/statsmodels/tsa/holtwinters.py index 3f19857a1a6..5c0a53720cb 100644 --- a/statsmodels/tsa/holtwinters.py +++ b/statsmodels/tsa/holtwinters.py @@ -348,7 +348,7 @@ def forecast(self, steps=1): end = self.model._index[-1] + steps * freq return self.model.predict(self.params, start=start, end=end) except (AttributeError, ValueError): - # May occur when the index doesn't have a freq + # May occur when the index does not have a freq return self.model._predict(h=steps, **self.params).fcastvalues def summary(self): diff --git a/statsmodels/tsa/kalmanf/kalman_loglike.pyx b/statsmodels/tsa/kalmanf/kalman_loglike.pyx index 70169e8db0b..c4bfa6285cf 100644 --- a/statsmodels/tsa/kalmanf/kalman_loglike.pyx +++ b/statsmodels/tsa/kalmanf/kalman_loglike.pyx @@ -86,7 +86,7 @@ def kalman_filter_double(double[:] y not None, double alph = 1.0 double beta = 0.0 - # NOTE: not sure about just checking F_mat[0, 0], didn't appear to work + # NOTE: not sure about just checking F_mat[0, 0], did not appear to work while not F_mat == 1. 
and i < nobs: # Predict # Z_mat is just a selector matrix diff --git a/statsmodels/tsa/kalmanf/kalmanfilter.py b/statsmodels/tsa/kalmanf/kalmanfilter.py index 0bd46720ca2..9e63fb0c253 100644 --- a/statsmodels/tsa/kalmanf/kalmanfilter.py +++ b/statsmodels/tsa/kalmanf/kalmanfilter.py @@ -172,7 +172,7 @@ def _init_kalman_state(cls, params, arma_model): if arma_model.transparams: newparams = arma_model._transparams(params) else: - newparams = params # don't need a copy if not modified. + newparams = params # do not need a copy if not modified. if k > 0: y -= np.dot(arma_model.exog, newparams[:k]) @@ -211,7 +211,7 @@ def loglike(cls, params, arma_model, set_sigma2=True): """ # TODO: see section 3.4.6 in Harvey for computing the derivatives in # the recursion itself. - # TODO: this won't work for time-varying parameters + # TODO: this will not work for time-varying parameters (y, k, nobs, k_ar, k_ma, k_lags, newparams, Z_mat, m, R_mat, T_mat, paramsdtype) = cls._init_kalman_state(params, arma_model) if np.issubdtype(paramsdtype, np.float64): diff --git a/statsmodels/tsa/regime_switching/markov_switching.py b/statsmodels/tsa/regime_switching/markov_switching.py index e101c26be99..922cbf6919c 100644 --- a/statsmodels/tsa/regime_switching/markov_switching.py +++ b/statsmodels/tsa/regime_switching/markov_switching.py @@ -913,7 +913,7 @@ def smooth(self, params, transformed=True, cov_type=None, cov_kwds=None, self.data.param_names = self.param_names # Hamilton filter - # TODO add option to filter to return logged values so that we don't + # TODO add option to filter to return logged values so that we do not # need to re-log them for smoother names = ['regime_transition', 'initial_probabilities', 'conditional_loglikelihoods', diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py index df257b952b3..b5c228df114 100644 --- 
a/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_autoregression.py @@ -518,7 +518,7 @@ def test_smoothed_regimes(self): hamilton_ar4_smoothed, atol=1e-5) def test_bse(self): - # Can't compare middle element of bse because we estimate sigma^2 + # Cannot compare middle element of bse because we estimate sigma^2 # rather than sigma bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:4], self.true['bse_oim'][:4], atol=1e-6) @@ -574,7 +574,7 @@ def test_predict(self): assert_allclose(actual, self.true['predict_smoothed'], atol=1e-6) def test_bse(self): - # Can't compare middle element of bse because we estimate sigma^2 + # Cannot compare middle element of bse because we estimate sigma^2 # rather than sigma bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:4], self.true['bse_oim'][:4], atol=1e-7) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py index a02e871bf2b..e56299ec419 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_regression.py +++ b/statsmodels/tsa/regime_switching/tests/test_markov_regression.py @@ -815,7 +815,7 @@ def test_predict(self): assert_allclose(actual, self.true['predict_smoothed'], atol=1e-6) def test_bse(self): - # Can't compare last element of bse because we estimate sigma^2 rather + # Cannot compare last element of bse because we estimate sigma^2 rather # than sigma^2 bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:-1], self.true['bse_oim'][:-1], atol=1e-7) @@ -948,7 +948,7 @@ def test_hamilton_filter_order_zero(self): nobs = 4 initial_probabilities = np.ones(k_regimes) / k_regimes - # We don't actually transition between the 3 regimes. + # We do not actually transition between the 3 regimes. 
regime_transition = np.eye(k_regimes)[:, :, np.newaxis] # Regime i correponds to a sequence of iid draws from discrete @@ -973,7 +973,7 @@ def test_hamilton_filter_order_zero_with_tvtp(self): nobs = 8 initial_probabilities = np.ones(k_regimes) / k_regimes - # We don't actually transition between the 3 regimes except from + # We do not actually transition between the 3 regimes except from # t=3 to t=4 where we reset to regimes 1 and 2 being equally # likely. regime_transition = np.zeros((k_regimes, k_regimes, nobs)) @@ -1048,7 +1048,7 @@ def setup_class(cls): true, fedfunds[1:], k_regimes=2, exog=fedfunds[:-1]) def test_bse(self): - # Can't compare last element of bse because we estimate sigma^2 rather + # Cannot compare last element of bse because we estimate sigma^2 rather # than sigma^2 bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:-1], self.true['bse_oim'][:-1], atol=1e-6) @@ -1110,7 +1110,7 @@ def test_predict(self): assert_allclose(actual, self.true['predict_smoothed'], atol=1e-5) def test_bse(self): - # Can't compare last element of bse because we estimate sigma^2 rather + # Cannot compare last element of bse because we estimate sigma^2 rather # than sigma^2 bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:-1], self.true['bse_oim'][:-1], atol=1e-7) @@ -1165,7 +1165,7 @@ def test_fit(self, **kwargs): super(TestAreturnsConstL1Variance, self).test_fit(**kwargs) def test_bse(self): - # Can't compare last two element of bse because we estimate sigma^2 + # Cannot compare last two element of bse because we estimate sigma^2 # rather than sigma bse = self.result.cov_params_approx.diagonal()**0.5 assert_allclose(bse[:-2], self.true['bse_oim'][:-2], atol=1e-7) diff --git a/statsmodels/tsa/regime_switching/tests/test_markov_switching.py b/statsmodels/tsa/regime_switching/tests/test_markov_switching.py index a6e67078815..de2e1f2989f 100644 --- a/statsmodels/tsa/regime_switching/tests/test_markov_switching.py +++ 
b/statsmodels/tsa/regime_switching/tests/test_markov_switching.py @@ -190,7 +190,7 @@ def test_initial_probabilities(): # Invalid known initial probabilities (too many elements) assert_raises(ValueError, mod.initialize_known, [0.2, 0.2, 0.6]) - # Invalid known initial probabilities (doesn't sum to 1) + # Invalid known initial probabilities (does not sum to 1) assert_raises(ValueError, mod.initialize_known, [0.2, 0.2]) # Valid steady-state probabilities diff --git a/statsmodels/tsa/statespace/_filters/_conventional.pyx.in b/statsmodels/tsa/statespace/_filters/_conventional.pyx.in index 8f307d161a2..8287a7d8045 100644 --- a/statsmodels/tsa/statespace/_filters/_conventional.pyx.in +++ b/statsmodels/tsa/statespace/_filters/_conventional.pyx.in @@ -86,7 +86,7 @@ cdef int {{prefix}}updating_missing_conventional({{prefix}}KalmanFilter kfilter, cdef {{cython_type}} {{prefix}}inverse_missing_conventional({{prefix}}KalmanFilter kfilter, {{prefix}}Statespace model, {{cython_type}} determinant) except *: # Since the inverse of the forecast error covariance matrix is not - # stored, we don't need to fill it (e.g. with NPY_NAN values). Instead, + # stored, we do not need to fill it (e.g. with NPY_NAN values). Instead, # just do a noop here and return a zero determinant ($|0|$). 
return 0.0 diff --git a/statsmodels/tsa/statespace/_filters/_inversions.pyx.in b/statsmodels/tsa/statespace/_filters/_inversions.pyx.in index d59684e9c21..d7061bc7cfb 100644 --- a/statsmodels/tsa/statespace/_filters/_inversions.pyx.in +++ b/statsmodels/tsa/statespace/_filters/_inversions.pyx.in @@ -207,7 +207,7 @@ cdef {{cython_type}} {{prefix}}inverse_cholesky({{prefix}}KalmanFilter kfilter, # ?potri only fills in the upper triangle of the symmetric array, and # since the ?symm and ?symv routines are not available as of scipy - # 0.11.0, we can't use them, so we must fill in the lower triangle + # 0.11.0, we cannot use them, so we must fill in the lower triangle # by hand for i in range(model._k_endog): # columns for j in range(i): # rows diff --git a/statsmodels/tsa/statespace/_filters/_univariate_diffuse.pyx.in b/statsmodels/tsa/statespace/_filters/_univariate_diffuse.pyx.in index b77f1386533..2254208d6d1 100644 --- a/statsmodels/tsa/statespace/_filters/_univariate_diffuse.pyx.in +++ b/statsmodels/tsa/statespace/_filters/_univariate_diffuse.pyx.in @@ -266,7 +266,7 @@ cdef void {{prefix}}predicted_diffuse_state_cov({{prefix}}KalmanFilter kfilter, {{cython_type}} beta = 0.0 # Need special handling for the completely missing case, since the - # conventional Kalman filter routines are used in this case and they don't + # conventional Kalman filter routines are used in this case and they do not # copy over the predicted diffuse state cov if model._nmissing == model.k_endog: blas.{{prefix}}copy(&kfilter.k_states2, kfilter._input_diffuse_state_cov, &inc, diff --git a/statsmodels/tsa/statespace/_kalman_filter.pyx.in b/statsmodels/tsa/statespace/_kalman_filter.pyx.in index d107db5e2dd..ba8740e98a5 100644 --- a/statsmodels/tsa/statespace/_kalman_filter.pyx.in +++ b/statsmodels/tsa/statespace/_kalman_filter.pyx.in @@ -181,7 +181,7 @@ cdef class {{prefix}}KalmanFilter(object): and redefined matrices based on missing values. 
`post_convergence` handles copying arrays from time $t-1$ to time $t$ when - the Kalman filter has converged and they don't need to be re-calculated. + the Kalman filter has converged and they do not need to be re-calculated. `forecasting` calls the Kalman filter `forcasting_` routine @@ -293,7 +293,7 @@ cdef class {{prefix}}KalmanFilter(object): # cdef readonly {{cython_type}} [::1,:,:] tmp1, tmp3 # Holds the determinant across calculations (this is done because after - # convergence, it doesn't need to be re-calculated anymore) + # convergence, it does not need to be re-calculated anymore) # cdef readonly {{cython_type}} determinant # ### Pointers to current-iteration arrays @@ -915,7 +915,7 @@ cdef class {{prefix}}KalmanFilter(object): # Handle missing data if self.model._nmissing > 0 or (self.model.has_missing and self.filter_method & FILTER_UNIVARIATE): # TODO there is likely a way to allow convergence and the univariate filter, but it - # doesn't work "out-of-the-box" right now + # does not work "out-of-the-box" right now self.converged = 0 cdef void initialize_filter_object_pointers(self): diff --git a/statsmodels/tsa/statespace/_kalman_smoother.pyx.in b/statsmodels/tsa/statespace/_kalman_smoother.pyx.in index ae10c3472de..efedda8bd83 100644 --- a/statsmodels/tsa/statespace/_kalman_smoother.pyx.in +++ b/statsmodels/tsa/statespace/_kalman_smoother.pyx.in @@ -442,7 +442,7 @@ cdef class {{prefix}}KalmanSmoother(object): else: _smooth_method = self.smooth_method - # Make sure we don't have an invalid smooth method for our filter + # Make sure we do not have an invalid smooth method for our filter # method if((_smooth_method & SMOOTH_UNIVARIATE) and not (self.filter_method & FILTER_UNIVARIATE) or (self.filter_method & FILTER_UNIVARIATE) and not (_smooth_method & SMOOTH_UNIVARIATE)): @@ -482,7 +482,7 @@ cdef class {{prefix}}KalmanSmoother(object): if not t == 0 and t >= self.model.nobs: raise IndexError("Observation index out of range") - # Make sure we haven't 
changed filter methods in-between seeking + # Make sure we have not changed filter methods in-between seeking if self.check_filter_method_changed(): raise RuntimeError("Filter method in associated Kalman filter was" " changed in between smoother seek() calls." @@ -519,7 +519,7 @@ cdef class {{prefix}}KalmanSmoother(object): if not self.t >= 0: raise StopIteration - # Make sure we haven't changed filter methods in-between iterations + # Make sure we have not changed filter methods in-between iterations if self.check_filter_method_changed(): raise RuntimeError("Filter method in associated Kalman filter was" " changed in between smoother iterations." @@ -623,9 +623,9 @@ cdef class {{prefix}}KalmanSmoother(object): transform_diagonalize = self.kfilter.filter_method & FILTER_UNIVARIATE # Initialize object-level pointers to statespace arrays - # Note: doesn't matter what transformations were required for the - # filter; we don't need to perform them for the smoother - # TODO actually we do, to get _design, _obs_cov, etc. However we don't + # Note: does not matter what transformations were required for the + # filter; we do not need to perform them for the smoother + # TODO actually we do, to get _design, _obs_cov, etc. 
However we do not # need it to recalculate the selected_obs and loglikelihood, so # need to decouple those parts from the generalized collapse self.model.seek(self.t, transform_diagonalize, transform_generalized_collapse) @@ -646,7 +646,7 @@ cdef class {{prefix}}KalmanSmoother(object): # ) # If a collapse should have occurred, the dimensions need to be - # adjusted (because we didn't tell the model about the collapse in the + # adjusted (because we did not tell the model about the collapse in the # seek() call above) # if collapse_occurred: # self.model.set_dimensions(self.model.k_states, diff --git a/statsmodels/tsa/statespace/_representation.pyx.in b/statsmodels/tsa/statespace/_representation.pyx.in index 6507696b0ea..14c8031e7c7 100644 --- a/statsmodels/tsa/statespace/_representation.pyx.in +++ b/statsmodels/tsa/statespace/_representation.pyx.in @@ -628,7 +628,7 @@ cdef class {{prefix}}Statespace(object): # functions as two-dimensional, column-major arrays. # # In the case that all data is missing (e.g. this is what happens in - forecasting), we actually set don't change the dimension, but we set + forecasting), we actually do not change the dimension, but we set # the design matrix to the zeros array.
if self._nmissing == self.k_endog: self._select_missing_entire_obs(t) @@ -718,7 +718,7 @@ cdef class {{prefix}}Statespace(object): # Perform the LDL decomposition, if necessary if t == 0 or self.obs_cov.shape[2] > 1 or reset_missing: - # Make sure we don't have an observation intercept + # Make sure we do not have an observation intercept if not np.sum(self.obs_intercept) == 0 or self.obs_intercept.shape[2] > 1: raise RuntimeError('The univariate method with non-diagonal observation covariance matrix' ' does not currently support an observation intercept.') @@ -833,7 +833,7 @@ cdef class {{prefix}}Statespace(object): for i in range(k_states): self.collapse_obs_cov[i,i] = 1 - # Make sure we don't have an observation intercept + # Make sure we do not have an observation intercept if not np.sum(self.obs_intercept) == 0 or self.obs_intercept.shape[2] > 1: raise RuntimeError('The observation collapse transformation' ' does not currently support an observation' diff --git a/statsmodels/tsa/statespace/_simulation_smoother.pyx.in b/statsmodels/tsa/statespace/_simulation_smoother.pyx.in index 1cb7b45a7cb..a4864adf568 100644 --- a/statsmodels/tsa/statespace/_simulation_smoother.pyx.in +++ b/statsmodels/tsa/statespace/_simulation_smoother.pyx.in @@ -219,7 +219,7 @@ cdef class {{prefix}}SimulationSmoother(object): # Initialize the simulated model memoryviews # Note: the actual initialization is replaced in the simulate() - # function below, but will complain if the memoryviews haven't been + # function below, but will complain if the memoryviews have not been # first initialized, which this call does. 
self.simulated_model.initialize_approximate_diffuse() if self.has_missing: diff --git a/statsmodels/tsa/statespace/_tools.pyx.in b/statsmodels/tsa/statespace/_tools.pyx.in index 2253456fe09..65827f305e6 100644 --- a/statsmodels/tsa/statespace/_tools.pyx.in +++ b/statsmodels/tsa/statespace/_tools.pyx.in @@ -97,7 +97,7 @@ cdef int _{{prefix}}solve_discrete_lyapunov({{cython_type}} * a, {{cython_type}} cdef np.npy_intp dim[2] cdef {{cython_type}} [::1,:] apI, capI, u, v cdef int [::1,:] ipiv - # Dummy selection function, won't actually be referenced since we don't + # Dummy selection function, will not actually be referenced since we do not # need to order the eigenvalues in the ?gees call. cdef: int sdim @@ -370,7 +370,7 @@ cpdef _{{prefix}}compute_coefficients_from_multivariate_pacf({{cython_type}} [:: if not transform_variance: initial_variance = np.asfortranarray(error_variance.copy()) # Need to make the input variance large enough that the recursions - don't lead to zero-matrices due to roundoff error, which would case + do not lead to zero-matrices due to roundoff error, which would cause
# Note that this will still not always ensure positive definiteness, # and for k_endog, order large enough an exception may still be raised diff --git a/statsmodels/tsa/statespace/kalman_filter.py b/statsmodels/tsa/statespace/kalman_filter.py index 3ce202e5275..e1051bd4c80 100644 --- a/statsmodels/tsa/statespace/kalman_filter.py +++ b/statsmodels/tsa/statespace/kalman_filter.py @@ -1218,8 +1218,8 @@ def impulse_responses(self, steps=10, impulse=0, orthogonalized=False, impulse = np.dot(state_chol, impulse) # If we have a time-invariant system, we can solve for the IRF directly - # Note that it doesn't matter if we have time-invariant intercepts, - # since those don't affect the IRF anyway + # Note that it does not matter if we have time-invariant intercepts, + # since those do not affect the IRF anyway time_invariant = ( self._design.shape[2] == self._obs_cov.shape[2] == self._transition.shape[2] == self._selection.shape[2] == @@ -2068,7 +2068,7 @@ def predict(self, start=None, end=None, dynamic=None, **kwargs): nforecast) def _predict(self, nstatic, ndynamic, nforecast, model): - # Note: this doesn't use self, and can either be a static method or + # Note: this does not use self, and can either be a static method or # moved outside the class altogether. # Get the underlying filter diff --git a/statsmodels/tsa/statespace/mlemodel.py b/statsmodels/tsa/statespace/mlemodel.py index 598fbdde658..33e8858ac45 100644 --- a/statsmodels/tsa/statespace/mlemodel.py +++ b/statsmodels/tsa/statespace/mlemodel.py @@ -185,7 +185,7 @@ def clone(self, endog, exog=None, **kwargs): raise NotImplementedError def _clone_from_init_kwds(self, endog, exog=None, **kwargs): - # Can't make this the default, because there is extra work required + # Cannot make this the default, because there is extra work required # for subclasses to make _get_init_kwds useful. 
use_kwargs = self._get_init_kwds() use_kwargs.update(kwargs) @@ -713,7 +713,7 @@ def loglikeobs(self, params, transformed=True, complex_step=False, if not transformed: params = self.transform_params(params) - # If we're using complex-step differentiation, then we can't use + # If we're using complex-step differentiation, then we cannot use # Cholesky factorization if complex_step: kwargs['inversion_method'] = INVERT_UNIVARIATE | SOLVE_LU @@ -748,7 +748,7 @@ def _forecasts_error_partial_derivatives(self, params, transformed=True, res=None, **kwargs): params = np.array(params, ndmin=1) - # We can't use complex-step differentiation with non-transformed + # We cannot use complex-step differentiation with non-transformed # parameters if approx_complex_step is None: approx_complex_step = transformed @@ -757,7 +757,7 @@ def _forecasts_error_partial_derivatives(self, params, transformed=True, " calculate the observed_information_matrix" " with untransformed parameters.") - # If we're using complex-step differentiation, then we can't use + # If we're using complex-step differentiation, then we cannot use # Cholesky factorization if approx_complex_step: kwargs['inversion_method'] = INVERT_UNIVARIATE | SOLVE_LU @@ -872,7 +872,7 @@ def observed_information_matrix(self, params, transformed=True, # Setup n = len(params) - # We can't use complex-step differentiation with non-transformed + # We cannot use complex-step differentiation with non-transformed # parameters if approx_complex_step is None: approx_complex_step = transformed @@ -884,7 +884,7 @@ def observed_information_matrix(self, params, transformed=True, # Get values at the params themselves self.update(params, transformed=transformed, complex_step=approx_complex_step) - # If we're using complex-step differentiation, then we can't use + # If we're using complex-step differentiation, then we cannot use # Cholesky factorization if approx_complex_step: kwargs['inversion_method'] = INVERT_UNIVARIATE | SOLVE_LU @@ -946,7 
+946,7 @@ def opg_information_matrix(self, params, transformed=True, NBER Chapters. National Bureau of Economic Research, Inc. """ - # We can't use complex-step differentiation with non-transformed + # We cannot use complex-step differentiation with non-transformed # parameters if approx_complex_step is None: approx_complex_step = transformed @@ -1462,7 +1462,7 @@ def simulate(self, params, nsimulations, measurement_shocks=None, simulated_obs, simulated_states = self.ssm.simulate( nsimulations, measurement_shocks, state_shocks, initial_state) - # Simulated obs is (nobs x k_endog); don't want to squeeze in + # Simulated obs is (nobs x k_endog); do not want to squeeze in # case of nsimulations = 1 if simulated_obs.shape[1] == 1: simulated_obs = simulated_obs[:, 0] @@ -1517,7 +1517,7 @@ def impulse_responses(self, params, steps=1, impulse=0, irfs = self.ssm.impulse_responses( steps, impulse, orthogonalized, cumulative, **kwargs) - # IRF is (nobs x k_endog); don't want to squeeze in case of steps = 1 + # IRF is (nobs x k_endog); do not want to squeeze in case of steps = 1 if irfs.shape[1] == 1: irfs = irfs[:, 0] @@ -2002,7 +2002,7 @@ def fittedvalues(self): """ (array) The predicted values of the model. An (nobs x k_endog) array. """ - # This is a (k_endog x nobs array; don't want to squeeze in case of + # This is a (k_endog x nobs array; do not want to squeeze in case of # the corner case where nobs = 1 (mostly a concern in the predict or # forecast functions, but here also to maintain consistency) fittedvalues = self.forecasts @@ -2057,7 +2057,7 @@ def resid(self): """ (array) The model residuals. An (nobs x k_endog) array. 
""" - # This is a (k_endog x nobs array; don't want to squeeze in case of + # This is a (k_endog x nobs array; do not want to squeeze in case of # the corner case where nobs = 1 (mostly a concern in the predict or # forecast functions, but here also to maintain consistency) resid = self.forecasts_error @@ -2399,7 +2399,7 @@ def get_prediction(self, start=None, end=None, dynamic=False, dynamic, _, _ = self.model._get_index_loc(dynamic) # Perform the prediction - # This is a (k_endog x npredictions) array; don't want to squeeze in + # This is a (k_endog x npredictions) array; do not want to squeeze in # case of npredictions = 1 prediction_results = self.filter_results.predict( start, end + out_of_sample + 1, dynamic, **kwargs) @@ -2609,7 +2609,7 @@ def append(self, endog, exog=None, refit=False, **kwargs): exog : array_like, optional New observations of exogenous regressors, if applicable. refit : bool, optional - Wheter to re-fit the parameters, based on the combined dataset. + Whether to re-fit the parameters, based on the combined dataset. Default is False (so parameters from the current results object are used to create the new results object). **kwargs @@ -2795,7 +2795,7 @@ def apply(self, endog, exog=None, refit=False, **kwargs): exog : array_like, optional New observations of exogenous regressors, if applicable. refit : bool, optional - Wheter to re-fit the parameters, using the new dataset. + Whether to re-fit the parameters, using the new dataset. Default is False (so parameters from the current results object are used to create the new results object). **kwargs @@ -3168,7 +3168,7 @@ def se_mean(self): def conf_int(self, method='endpoint', alpha=0.05, **kwds): # TODO: this performs metadata wrapping, and that should be handled - # by attach_* methods. However, they don't currently support + # by attach_* methods. However, they do not currently support # this use case. 
conf_int = super(PredictionResults, self).conf_int( method, alpha, **kwds) @@ -3206,8 +3206,8 @@ def summary_frame(self, endog=0, what='all', alpha=0.05): to_include['mean_ci_lower'] = ci_mean[:, endog] to_include['mean_ci_upper'] = ci_mean[:, k_endog + endog] - # OrderedDict doesn't work to preserve sequence - # pandas dict doesn't handle 2d_array + # OrderedDict does not work to preserve sequence + # pandas dict does not handle 2d_array # data = np.column_stack(list(to_include.values())) # names = .... res = pd.DataFrame(to_include, index=self.row_labels, diff --git a/statsmodels/tsa/statespace/sarimax.py b/statsmodels/tsa/statespace/sarimax.py index 7c67dbcd404..442ec6c7581 100644 --- a/statsmodels/tsa/statespace/sarimax.py +++ b/statsmodels/tsa/statespace/sarimax.py @@ -254,7 +254,7 @@ class SARIMAX(MLEModel): estimation. In this implementation of differenced models, the Hamilton representation - is not able to accomodate differencing in the state vector, so + is not able to accommodate differencing in the state vector, so `simple_differencing` (which performs differencing prior to estimation so that the first d + sD observations are lost) must be used. 
@@ -608,7 +608,7 @@ def initialize(self): # Save the indices corresponding to the reduced form lag polynomial # parameters in the transition and selection matrices so that they - don't have to be recalculated for each update() + do not have to be recalculated for each update() start_row = self._k_states_diff end_row = start_row + self.k_ar + self.k_seasonal_ar col = self._k_states_diff @@ -659,7 +659,7 @@ def initialize_default(self, approximate_diffuse_variance=None): init.set((self._k_states_diff + self._k_order, self._k_states_diff + self._k_order + self.k_exog), 'approximate_diffuse') - # If we're not enforcing a stationarity, then we can't initialize a + # If we're not enforcing stationarity, then we cannot initialize a # stationary component else: init.set(None, 'approximate_diffuse') diff --git a/statsmodels/tsa/statespace/simulation_smoother.py b/statsmodels/tsa/statespace/simulation_smoother.py index 29a83c3d326..8283a844ea8 100644 --- a/statsmodels/tsa/statespace/simulation_smoother.py +++ b/statsmodels/tsa/statespace/simulation_smoother.py @@ -96,7 +96,7 @@ def get_simulation_output(self, simulation_output=None, Additional keyword arguments. Present so that calls to this method can use \*\*kwargs without clearing out additional arguments.
""" - # If we don't explicitly have simulation_output, try to get it from + # If we do not explicitly have simulation_output, try to get it from # kwargs if simulation_output is None: simulation_output = 0 @@ -111,7 +111,7 @@ def get_simulation_output(self, simulation_output=None, # Handle case of no information in kwargs if simulation_output == 0: - # If some arguments were passed, but we still don't have any + # If some arguments were passed, but we still do not have any # simulation output, raise an exception argument_set = not all([ simulate_state is None, simulate_disturbance is None, diff --git a/statsmodels/tsa/statespace/structural.py b/statsmodels/tsa/statespace/structural.py index 38e9046c130..6568d45af18 100644 --- a/statsmodels/tsa/statespace/structural.py +++ b/statsmodels/tsa/statespace/structural.py @@ -513,7 +513,7 @@ def __init__(self, endog, level=False, trend=False, seasonal=None, self.trend * self.stochastic_trend * 0x10 ) - # Create the trend specification, if it wasn't given + # Create the trend specification, if it was not given if self.trend_specification is None: # trend specification may be none, e.g. if the model is only # a stochastic cycle, etc. @@ -549,8 +549,8 @@ def __init__(self, endog, level=False, trend=False, seasonal=None, self.autoregressive ) - # The ar states are initialized as stationary, so they don't need to be - # burned. + # The ar states are initialized as stationary, so they do not need to + # be burned. loglikelihood_burn = kwargs.get('loglikelihood_burn', k_states - self.ar_order) @@ -874,7 +874,7 @@ def start_params(self): if self.cycle: _start_params['cycle_var'] = var_resid # Clip this to make sure it is positive and strictly stationary - # (i.e. don't want negative or 1) + # (i.e. 
do not want negative or 1) _start_params['cycle_damp'] = np.clip( np.linalg.pinv(resid[:-1, None]).dot(resid[1:])[0], 0, 0.99 ) diff --git a/statsmodels/tsa/statespace/tests/results/test_exact_diffuse_filtering_stata.do b/statsmodels/tsa/statespace/tests/results/test_exact_diffuse_filtering_stata.do index 05696c34f7c..e4ff1f1901c 100644 --- a/statsmodels/tsa/statespace/tests/results/test_exact_diffuse_filtering_stata.do +++ b/statsmodels/tsa/statespace/tests/results/test_exact_diffuse_filtering_stata.do @@ -44,7 +44,7 @@ disp %20.19g e(ll) // Local linear trend + missing -// Have to skip this since Stata doesn't allow missing values, even using sspace +// Have to skip this since Stata does not allow missing values, even using sspace // clear // input x t // 10.2394 1 diff --git a/statsmodels/tsa/statespace/tests/results/test_ucm.R b/statsmodels/tsa/statespace/tests/results/test_ucm.R index 8973667822e..9289da3f4ba 100644 --- a/statsmodels/tsa/statespace/tests/results/test_ucm.R +++ b/statsmodels/tsa/statespace/tests/results/test_ucm.R @@ -122,7 +122,7 @@ print(exp(res_reg$optim.out$par)) # [2.215447924] print(res_reg$optim.out$value) # 379.6233483 # Random trend + AR(1) -# Note: KFAS doesn't want to estimate these parameters, so just fix them +# Note: KFAS does not want to estimate these parameters, so just fix them # to the MLE estimates from Statsmodels and compare the loglikelihood # mod.update([]) mod_rtrend_ar1 <- SSModel(dta$unemp ~ SSMtrend(2, Q=list(matrix(0), matrix(0.0609))) + SSMarima(ar=c(0.9592), Q=matrix(0.0097)), H=matrix(0)) diff --git a/statsmodels/tsa/statespace/tests/test_concentrated.py b/statsmodels/tsa/statespace/tests/test_concentrated.py index 8e0911ea956..2d5e3776ba3 100644 --- a/statsmodels/tsa/statespace/tests/test_concentrated.py +++ b/statsmodels/tsa/statespace/tests/test_concentrated.py @@ -42,7 +42,7 @@ def get_sarimax_models(endog, filter_univariate=False, **kwargs): def test_concentrated_loglike_sarimax(): - # Note: we won't use the 
"concentrate_scale" option to SARIMAX for this + # Note: we will not use the "concentrate_scale" option to SARIMAX for this # test, which is a lower-level test of the Kalman filter using the SARIMAX # model as an example nobs = 30 @@ -97,7 +97,7 @@ def test_concentrated_loglike_sarimax(): def test_concentrated_predict_sarimax(): - # Note: we won't use the "concentrate_scale" option to SARIMAX for this + # Note: we will not use the "concentrate_scale" option to SARIMAX for this # test, which is a lower-level test of the Kalman filter using the SARIMAX # model as an example nobs = 30 @@ -181,7 +181,7 @@ def check_concentrated_scale(filter_univariate=False, missing=False, **kwargs): mod_orig.ssm.filter_univariate = filter_univariate mod_conc.ssm.filter_univariate = filter_univariate - # Since VARMAX doesn't explicitly allow concentrating out the scale, for + # Since VARMAX does not explicitly allow concentrating out the scale, for # now we will simulate it by setting the first variance to be 1. 
# Note that start_scale will not be the scale used for the non-concentrated # model, because we need to use the MLE scale estimated by the @@ -233,7 +233,7 @@ def check_concentrated_scale(filter_univariate=False, missing=False, **kwargs): desired = getattr(res_orig.filter_results, name) assert_allclose(actual, desired, atol=1e-7) - # Note: don't want to compare the elements from any diffuse + # Note: do not want to compare the elements from any diffuse # initialization for things like covariances, so only compare for # periods past the loglikelihood_burn period filter_attr_burn = ['standardized_forecasts_error', diff --git a/statsmodels/tsa/statespace/tests/test_dynamic_factor.py b/statsmodels/tsa/statespace/tests/test_dynamic_factor.py index 7414a1b32d0..e9e69d6a2e7 100644 --- a/statsmodels/tsa/statespace/tests/test_dynamic_factor.py +++ b/statsmodels/tsa/statespace/tests/test_dynamic_factor.py @@ -99,7 +99,7 @@ def test_plot_coefficients_of_determination(self, close_figures): def test_no_enforce(self): return - # Test that nothing goes wrong when we don't enforce stationarity + # Test that nothing goes wrong when we do not enforce stationarity params = self.model.untransform_params(self.true['params']) params[self.model._params_transition] = ( self.true['params'][self.model._params_transition]) @@ -183,24 +183,24 @@ def setup_class(cls): true, k_factors=2, factor_order=1) def test_mle(self): - # Stata's MLE on this model doesn't converge, so no reason to check + # Stata's MLE on this model does not converge, so no reason to check pass def test_bse(self): - # Stata's MLE on this model doesn't converge, and four of their - # params don't even have bse (possibly they are still at starting + # Stata's MLE on this model does not converge, and four of their + # params do not even have bse (possibly they are still at starting # values?), so no reason to check this pass def test_aic(self): # Stata uses 9 df (i.e. 
9 params) here instead of 13, because since the - # model didn't coverge, 4 of the parameters aren't fully estimated + # model did not converge, 4 of the parameters are not fully estimated # (possibly they are still at starting values?) so the AIC is off pass def test_bic(self): # Stata uses 9 df (i.e. 9 params) here instead of 13, because since the - # model didn't coverge, 4 of the parameters aren't fully estimated + # model did not converge, 4 of the parameters are not fully estimated # (possibly they are still at starting values?) so the BIC is off pass diff --git a/statsmodels/tsa/statespace/tests/test_exact_diffuse_filtering.py b/statsmodels/tsa/statespace/tests/test_exact_diffuse_filtering.py index 50474c74432..bd65b877f15 100644 --- a/statsmodels/tsa/statespace/tests/test_exact_diffuse_filtering.py +++ b/statsmodels/tsa/statespace/tests/test_exact_diffuse_filtering.py @@ -353,7 +353,7 @@ def test_common_level_analytic(): # Output of the exact diffuse initialization, see Koopman (1997) # Note: since Koopman (1997) did not apply the univariate method, - # forecast errors and covariances, and the Kalman gain won't match + # forecast errors and covariances, and the Kalman gain will not match # assert_allclose(res.forecasts_error[:, 0], [y11, y21]) # assert_allclose(res.forecasts_error_cov[:, :, 0], np.eye(2)) # F_inf1 = np.array([[1, theta], @@ -390,7 +390,7 @@ def test_common_level_restricted_analytic(): # Output of the exact diffuse initialization, see Koopman (1997) phi = 1 / (1 + theta**2) # Note: since Koopman (1997) did not apply the univariate method, - # forecast errors and covariances, and the Kalman gain won't match + # forecast errors and covariances, and the Kalman gain will not match # assert_allclose(res.forecasts_error[:, 0], [y11, y21]) # assert_allclose(res.forecasts_error_cov[0, 0, 0], np.eye(2)) # F_inf1 = np.array([[1, theta], @@ -536,7 +536,7 @@ def test_smoothed_state_disturbance_cov(self, rtol_diffuse=1e-5): # - Smoothed intermediate tests
------------------------------------------ - @pytest.mark.skip("This isn't computed in the univariate method or " + @pytest.mark.skip("This is not computed in the univariate method or " "by KFAS.") def test_smoothing_error(self, rtol_diffuse=None): actual = self.results_a.smoothing_error @@ -554,7 +554,7 @@ def test_scaled_smoothed_estimator_cov(self, rtol_diffuse=1e-5): self.check_object(actual, desired, rtol_diffuse) # - Diffuse objects tests ------------------------------------------------ - # Note: these can't be checked against the approximate diffuse method. + # Note: these cannot be checked against the approximate diffuse method. def test_forecasts_error_diffuse_cov(self, rtol_diffuse=None): actual = self.results_a.forecasts_error_diffuse_cov @@ -567,7 +567,7 @@ def test_predicted_diffuse_state_cov(self, rtol_diffuse=None): self.check_object(actual, desired, rtol_diffuse) # TODO: do something with this other than commenting it out? - # We don't currently store this array + # We do not currently store this array # def test_kalman_gain_diffuse(self, rtol_diffuse=None): # actual = self.results_a. # desired = self.results_b. @@ -725,8 +725,8 @@ class TestVAR1MeasurementError_Approx(CheckApproximateDiffuseMixin, CheckVAR1MeasurementError): # Note: somewhat fragile, we need to increase the approximate variance to # 1e9 for the tests to pass at the appropriate level of precision, but - we can't increase too much more than this because then we start get - numerical errors (e.g. 1e10 is fine but 1e11 doesn't pass) + we cannot increase too much more than this because then we start to get + numerical errors (e.g.
1e10 is fine but 1e11 does not pass) approximate_diffuse_variance = 1e9 def test_smoothed_measurement_disturbance_cov(self, rtol_diffuse=None): @@ -770,8 +770,8 @@ def test_nobs_diffuse(self): class TestVAR1Missing_Approx(CheckApproximateDiffuseMixin, CheckVAR1Missing): # Note: somewhat fragile, we need to increase the approximate variance to # 1e10 for the tests to pass at the appropriate level of precision, but - we can't increase it any more than this because then we start get - numerical errors (e.g. 1e11 doesn't pass) + we cannot increase it any more than this because then we start to get + numerical errors (e.g. 1e11 does not pass) approximate_diffuse_variance = 1e10 def test_smoothed_state_cov(self, rtol_diffuse=None): @@ -900,8 +900,8 @@ def test_initialization(self): class TestDFM_Approx(CheckApproximateDiffuseMixin, CheckDFM): # Note: somewhat fragile, we need to increase the approximate variance to # 5e10 for the tests to pass at the appropriate level of precision, but - we can't increase it too much more than this because then we start get - numerical errors (e.g. 1e11 works but 1e12 doesn't pass) + we cannot increase it too much more than this because then we start to get + numerical errors (e.g. 1e11 works but 1e12 does not pass) approximate_diffuse_variance = 5e10 @@ -947,13 +947,13 @@ def test_initialization(self): class TestDFMCollapsed_Approx(CheckApproximateDiffuseMixin, CheckDFMCollapsed): # Note: somewhat fragile, we need to increase the approximate variance to # 1e9 for the tests to pass at the appropriate level of precision, but - we can't increase it too much more than this because then we start get - numerical errors (e.g. 1e10 doesn't pass) + we cannot increase it too much more than this because then we start to get + numerical errors (e.g.
1e10 does not pass) approximate_diffuse_variance = 1e9 -# FIXME: don't leave this commented-out -# Note: we cannot test against KFAS, since it doesn't support collapsed +# FIXME: do not leave this commented-out +# Note: we cannot test against KFAS, since it does not support collapsed # filtering # class TestDFMCollapsed_KFAS(CheckKFASMixin, TestDFMCollapsed): # results_path = os.path.join( diff --git a/statsmodels/tsa/statespace/tests/test_models.py b/statsmodels/tsa/statespace/tests/test_models.py index 8f518f2c9f7..2a184ba893b 100644 --- a/statsmodels/tsa/statespace/tests/test_models.py +++ b/statsmodels/tsa/statespace/tests/test_models.py @@ -20,7 +20,7 @@ class Intercepts(mlemodel.MLEModel): """ - Test class for observation and state intercepts (which usually don't + Test class for observation and state intercepts (which usually do not get tested in other models). """ def __init__(self, endog, **kwargs): @@ -195,7 +195,7 @@ def test_smoothed_measurement_disturbance_cov(self): class LargeStateCovAR1(mlemodel.MLEModel): """ - Test class for k_posdef > k_states (which usually don't get tested in + Test class for k_posdef > k_states (which usually do not get tested in other models). 
This is just an AR(1) model with an extra unused state innovation diff --git a/statsmodels/tsa/statespace/tests/test_representation.py b/statsmodels/tsa/statespace/tests/test_representation.py index f36ecbe18b4..70d146887ed 100644 --- a/statsmodels/tsa/statespace/tests/test_representation.py +++ b/statsmodels/tsa/statespace/tests/test_representation.py @@ -744,7 +744,7 @@ def test_cython(): mod._initialize_filter() kf = mod._kalman_filters['d'] - # Rebind data, still float, check that we haven't changed + # Rebind data, still float, check that we have not changed mod.bind(endog) mod._initialize_filter() assert_equal(mod._kalman_filter, kf) diff --git a/statsmodels/tsa/statespace/tests/test_sarimax.py b/statsmodels/tsa/statespace/tests/test_sarimax.py index 2af7a7ff8b5..17cd84632a2 100644 --- a/statsmodels/tsa/statespace/tests/test_sarimax.py +++ b/statsmodels/tsa/statespace/tests/test_sarimax.py @@ -977,7 +977,7 @@ def test_loglike(self): ) def test_start_params(self): - # just a quick test that start_params isn't throwing an exception + # just a quick test that start_params is not throwing an exception # (other than related to invertibility) stat = self.model.enforce_stationarity inv = self.model.enforce_invertibility @@ -1970,7 +1970,7 @@ def test_simple_time_varying(): time_varying_regression=True, mle_regression=False) - # Ignore the warning that MLE doesn't converge + # Ignore the warning that MLE does not converge with warnings.catch_warnings(): warnings.simplefilter("ignore") res = mod.fit(disp=-1) @@ -2267,7 +2267,7 @@ def check_concentrated_scale(filter_univariate=False): # the non-concentrated model will expect as parameters if kwargs['time_varying_regression'] and kwargs['exog'] is not None: k_snr += 1 - # Note: the log-likelihood isn't exactly the same between concentrated + # Note: the log-likelihood is not exactly the same between concentrated # and non-concentrated models with time-varying regression, so this # combinations raises 
NotImplementedError. @@ -2309,7 +2309,7 @@ def check_concentrated_scale(filter_univariate=False): desired = getattr(res_orig.filter_results, name) assert_allclose(actual, desired, atol=atol) - # Note: don't want to compare the elements from any diffuse + # Note: do not want to compare the elements from any diffuse # initialization for things like covariances, so only compare for # periods past the loglikelihood_burn period filter_attr_burn = ['llf_obs', 'standardized_forecasts_error', @@ -2386,7 +2386,7 @@ def check_concentrated_scale(filter_univariate=False): assert_allclose(actual.se_mean, desired.se_mean, atol=atol) # Test simulate - # Simulate is currently broken for time-varying models, so don't try + # Simulate is currently broken for time-varying models, so do not try # to test it here np.random.seed(13847) if mod_conc.ssm.time_invariant: diff --git a/statsmodels/tsa/statespace/tests/test_simulate.py b/statsmodels/tsa/statespace/tests/test_simulate.py index 8da1bc95e1f..4a34625b7e0 100644 --- a/statsmodels/tsa/statespace/tests/test_simulate.py +++ b/statsmodels/tsa/statespace/tests/test_simulate.py @@ -19,7 +19,7 @@ def test_arma_lfilter(): # Tests of an ARMA model simulation against scipy.signal.lfilter # Note: the first elements of the generated SARIMAX datasets are based on - # the initial state, so we don't include them in the comparisons + # the initial state, so we do not include them in the comparisons np.random.seed(10239) nobs = 100 eps = np.random.normal(size=nobs) @@ -50,7 +50,7 @@ def test_arma_direct(): # Tests of an ARMA model simulation against direct construction # This is useful for e.g. 
trend components # Note: the first elements of the generated SARIMAX datasets are based on - # the initial state, so we don't include them in the comparisons + # the initial state, so we do not include them in the comparisons np.random.seed(10239) nobs = 100 eps = np.random.normal(size=nobs) diff --git a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py index d1b7a050342..866aadb52c8 100644 --- a/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py +++ b/statsmodels/tsa/statespace/tests/test_simulation_smoothing.py @@ -395,7 +395,7 @@ class TestMultivariateVARKnownMissingAll(MultivariateVARKnown): """ Notes ----- - Can't test against KFAS because they have a different behavior for + Cannot test against KFAS because they have a different behavior for missing entries. When an entry is missing, KFAS does not draw a simulation smoothed value for that entry, whereas we draw from the unconditional distribution. 
It appears there is nothing to definitively recommend one diff --git a/statsmodels/tsa/statespace/tests/test_var.py b/statsmodels/tsa/statespace/tests/test_var.py index 97adc956666..1768f9ab4cd 100644 --- a/statsmodels/tsa/statespace/tests/test_var.py +++ b/statsmodels/tsa/statespace/tests/test_var.py @@ -196,7 +196,7 @@ def test_var_ctt(): # VAR(2), constant, trend, and trend**2, no exog # Note that this is comparing against trend as exog in the R package, - # since it doesn't have a built-in option for trend**2 + # since it does not have a built-in option for trend**2 results = results_var_R.res_ctt_as_exog1 mod = varmax.VARMAX(endog, order=(2, 0), trend='ctt', loglikelihood_burn=2) diff --git a/statsmodels/tsa/statespace/tests/test_varmax.py b/statsmodels/tsa/statespace/tests/test_varmax.py index 8faa0c2469c..1ac4ae78da6 100644 --- a/statsmodels/tsa/statespace/tests/test_varmax.py +++ b/statsmodels/tsa/statespace/tests/test_varmax.py @@ -30,7 +30,7 @@ class CheckVARMAX(object): """ Test Vector Autoregression against Stata's `dfactor` code (Stata's - `var` function uses OLS and not state space / MLE, so we can't get + `var` function uses OLS and not state space / MLE, so we cannot get equivalent log-likelihoods) """ @@ -324,7 +324,7 @@ def setup_class(cls): cls.results2 = cls.model.smooth(params) def test_mle(self): - # With the additional measurment error parameters, this wouldn't be + # With the additional measurement error parameters, this would not be # a meaningful test pass @@ -692,7 +692,7 @@ def setup_class(cls): def test_mle(self): # Since the VARMA model here is generic (we're just forcing zeros - in some params) whereas Stata's is restricted, the MLE test isn't + in some params) whereas Stata's is restricted, the MLE test is not # meaninful pass @@ -791,7 +791,7 @@ def setup_class(cls): def test_mle(self): # Since the VARMA model here is generic (we're just forcing zeros - in some params) whereas Stata's is restricted, the MLE test isn't + in
some params) whereas Stata's is restricted, the MLE test is not # meaninful pass diff --git a/statsmodels/tsa/statespace/tools.py b/statsmodels/tsa/statespace/tools.py index adfbd699e76..6e787fb87ab 100644 --- a/statsmodels/tsa/statespace/tools.py +++ b/statsmodels/tsa/statespace/tools.py @@ -191,9 +191,9 @@ def companion_matrix(polynomial): if isinstance(polynomial, list) or isinstance(polynomial, tuple): try: - # Note: can't use polynomial[0] because of the special behavior - # associated with matrix polynomials and the constant 1, see - # below. + # Note: cannot use polynomial[0] because of the special + # behavior associated with matrix polynomials and the constant + # 1, see below. m = len(polynomial[1]) except TypeError: m = 1 @@ -593,7 +593,7 @@ def _compute_coefficients_from_multivariate_pacf_python( if not transform_variance: initial_variance = error_variance # Need to make the input variance large enough that the recursions - # don't lead to zero-matrices due to roundoff error, which would case + # do not lead to zero-matrices due to roundoff error, which would cause # exceptions from the Cholesky decompositions.
 # Note that this will still not always ensure positive definiteness,
 # and for k_endog, order large enough an exception may still be raised
@@ -1426,7 +1426,7 @@ def validate_matrix_shape(name, shape, nrows, ncols, nobs):
         raise ValueError('Invalid dimensions for %s matrix: requires %d'
                          ' columns, got %d' % (name, ncols, shape[1]))
-    # If we don't yet know `nobs`, don't allow time-varying arrays
+    # If we do not yet know `nobs`, do not allow time-varying arrays
     if nobs is None and not (ndim == 2 or shape[-1] == 1):
         raise ValueError('Invalid dimensions for %s matrix: time-varying'
                          ' matrices cannot be given unless `nobs` is specified'
@@ -1473,7 +1473,7 @@ def validate_vector_shape(name, shape, nrows, nobs):
         raise ValueError('Invalid dimensions for %s vector: requires %d'
                          ' rows, got %d' % (name, nrows, shape[0]))
-    # If we don't yet know `nobs`, don't allow time-varying arrays
+    # If we do not yet know `nobs`, do not allow time-varying arrays
     if nobs is None and not (ndim == 1 or shape[-1] == 1):
         raise ValueError('Invalid dimensions for %s vector: time-varying'
                          ' vectors cannot be given unless `nobs` is specified'
@@ -1614,8 +1614,8 @@ def copy_missing_matrix(A, B, missing, missing_rows=False, missing_cols=False,
     if not inplace:
         B = np.copy(B, order='F')
-    # We may have been given an F-contiguous memoryview; in that case, we don't
-    # want to alter it or convert it to a numpy array
+    # We may have been given an F-contiguous memoryview; in that case, we do
+    # not want to alter it or convert it to a numpy array
     try:
         if not A.is_f_contig():
             raise ValueError()
@@ -1660,8 +1660,8 @@ def copy_missing_vector(a, b, missing, inplace=False, prefix=None):
     if not inplace:
         b = np.copy(b, order='F')
-    # We may have been given an F-contiguous memoryview; in that case, we don't
-    # want to alter it or convert it to a numpy array
+    # We may have been given an F-contiguous memoryview; in that case, we do
+    # not want to alter it or convert it to a numpy array
     try:
         if not a.is_f_contig():
             raise ValueError()
@@ -1717,8 +1717,8 @@ def copy_index_matrix(A, B, index, index_rows=False, index_cols=False,
     if not inplace:
         B = np.copy(B, order='F')
-    # We may have been given an F-contiguous memoryview; in that case, we don't
-    # want to alter it or convert it to a numpy array
+    # We may have been given an F-contiguous memoryview; in that case, we do
+    # not want to alter it or convert it to a numpy array
     try:
         if not A.is_f_contig():
             raise ValueError()
@@ -1763,8 +1763,8 @@ def copy_index_vector(a, b, index, inplace=False, prefix=None):
     if not inplace:
         b = np.copy(b, order='F')
-    # We may have been given an F-contiguous memoryview; in that case, we don't
-    # want to alter it or convert it to a numpy array
+    # We may have been given an F-contiguous memoryview; in that case, we do
+    # not want to alter it or convert it to a numpy array
     try:
         if not a.is_f_contig():
             raise ValueError()
diff --git a/statsmodels/tsa/statespace/varmax.py b/statsmodels/tsa/statespace/varmax.py
index b5c20f581b1..c3234eee54f 100644
--- a/statsmodels/tsa/statespace/varmax.py
+++ b/statsmodels/tsa/statespace/varmax.py
@@ -313,12 +313,7 @@ def start_params(self):
         # A. Run a multivariate regression to get beta estimates
         endog = pd.DataFrame(self.endog.copy())
-        # Pandas < 0.13 didn't support the same type of DataFrame interpolation
-        # TODO remove this now that we have dropped support for Pandas < 0.13
-        try:
-            endog = endog.interpolate()
-        except TypeError:
-            pass
+        endog = endog.interpolate()
         endog = endog.fillna(method='backfill').values
         exog = None
         if self.k_trend > 0 and self.k_exog > 0:
@@ -677,7 +672,7 @@ def update(self, params, **kwargs):
         # - Trend
         if self.k_trend > 0:
-            # If we didn't set the intercept above, zero it out so we can
+            # If we did not set the intercept above, zero it out so we can
             # just += later
             if not self.mle_regression:
                 zero = np.array(0, dtype=params.dtype)
@@ -920,7 +915,7 @@ def get_prediction(self, start=None, end=None, dynamic=False, index=None,
                           ' required. `exog` argument ignored.', ValueWarning)
         # If we had exog, then the last predicted_state has been set to NaN
-        # since we didn't have the appropriate exog to create it. Then, if
+        # since we did not have the appropriate exog to create it. Then, if
         # we are forecasting, we now have new exog that we need to put into
         # the existing state_intercept array (and we will take it out, below)
         if last_intercept is not None:
diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py
index b0847c6dad0..307d68373ee 100644
--- a/statsmodels/tsa/stattools.py
+++ b/statsmodels/tsa/stattools.py
@@ -919,7 +919,7 @@ def ccf(x, y, unbiased=True):
     series it is recommended to use fft convolution instead.
     If unbiased is true, the denominator for the autocovariance is adjusted
-    but the autocorrelation is not an unbiased estimtor.
+    but the autocorrelation is not an unbiased estimator.
     '''
     x = array_like(x, 'x')
@@ -1350,7 +1350,7 @@ def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
         crit = [np.nan] * 3  # 2010 critical values not available
     else:
         crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1)
-        # nobs - 1, the -1 is to match egranger in Stata, I don't know why.
+        # nobs - 1, the -1 is to match egranger in Stata, I do not know why.
         # TODO: check nobs or df = nobs - k
     pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars)
@@ -1367,7 +1367,7 @@ def _safe_arma_fit(y, order, model_kw, trend, fit_kw, start_params=None):
         return
     except ValueError as error:
-        if start_params is not None:  # don't recurse again
+        if start_params is not None:  # do not recurse again
             # user supplied start_params only get one chance
             return
         # try a little harder, should be handled in fit really
diff --git a/statsmodels/tsa/tests/arima.do b/statsmodels/tsa/tests/arima.do
index 7a9c3035d58..3b9e13f3370 100644
--- a/statsmodels/tsa/tests/arima.do
+++ b/statsmodels/tsa/tests/arima.do
@@ -26,7 +26,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -73,7 +73,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -120,7 +120,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -167,7 +167,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
diff --git a/statsmodels/tsa/tests/arima112.do b/statsmodels/tsa/tests/arima112.do
index ec88ab79835..818db931134 100644
--- a/statsmodels/tsa/tests/arima112.do
+++ b/statsmodels/tsa/tests/arima112.do
@@ -20,7 +20,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -68,7 +68,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -115,7 +115,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -163,7 +163,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
diff --git a/statsmodels/tsa/tests/arima211.do b/statsmodels/tsa/tests/arima211.do
index 0d2759e34b2..9207ed44216 100644
--- a/statsmodels/tsa/tests/arima211.do
+++ b/statsmodels/tsa/tests/arima211.do
@@ -20,7 +20,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -66,7 +66,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -74,7 +74,7 @@ predict y, y
 predict resid, resid
 predict yr, yr
 predict mse, mse
-/* can't do stdp without a constant
+/* cannot do stdp without a constant
 predict stdp, stdp
 */
 estat ic
@@ -116,7 +116,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
@@ -162,7 +162,7 @@ mat k_ma=e(ma_max)
 mat params=e(b)
 mat cov_params=e(V)
-// don't append because you'll rewrite the bunch class
+// do not append because you'll rewrite the bunch class
 // mat2nparray llf nobs k k_exog sigma chi2 df_model k_ar k_ma params cov_params, saving("/home/skipper/statsmodels/statsmodels-skipper/statsmodels/tsa/tests/results/arima_results.py") format("%16.0g") replace
 predict xb
diff --git a/statsmodels/tsa/tests/results/results_arima.py b/statsmodels/tsa/tests/results/results_arima.py
index 3cf968da759..b3cb041a10b 100644
--- a/statsmodels/tsa/tests/results/results_arima.py
+++ b/statsmodels/tsa/tests/results/results_arima.py
@@ -181,7 +181,7 @@ def __init__(self, method="mle"):
                 204.7540, 207.2581, 208.6696, 210.5136, 214.1399, 215.5866,
                 220.6022, 218.2942, 212.6785, 213.2020, 215.2081]
-        # forecasting isn't any different for css
+        # forecasting is not any different for css
         # except you lose the first p+1 observations for in-sample
         # these results are from x-12 arima
         self.forecast = forecast_results['fc111c_css'][-25:]
@@ -260,7 +260,7 @@ def __init__(self, method="mle"):
         self.cov_params = (
             cov_params + cov_params.T - np.diag(np.diag(cov_params)))
         self.bse = np.sqrt(np.diag(self.cov_params))
-        # forecasting isn't any different for css
+        # forecasting is not any different for css
         # except you lose the first p+1 observations for in-sample
         self.forecast = forecast_results['fc111c_css'][-25:]
         self.forecasterr = forecast_results['fc111cse_css'][-25:]
diff --git a/statsmodels/tsa/tests/results/stl-fixed.f90 b/statsmodels/tsa/tests/results/stl-fixed.f90
index ef8b3e6a578..b9c00427f06 100644
--- a/statsmodels/tsa/tests/results/stl-fixed.f90
+++ b/statsmodels/tsa/tests/results/stl-fixed.f90
@@ -185,7 +185,7 @@ subroutine rwts(y, n, fit, rw)
     do i = 1, n
         rw(i) = abs(y(i) - fit(i))
     end do
-    ! TODO: psort doesn't work correctly, reorder mid
+    ! TODO: psort does not work correctly, reorder mid
     mid(2) = n / 2 + 1
     mid(1) = n - mid(2) + 1
     call psort(rw, n, mid, 2)
diff --git a/statsmodels/tsa/tests/test_arima.py b/statsmodels/tsa/tests/test_arima.py
index 2459574e1c2..f7809350642 100644
--- a/statsmodels/tsa/tests/test_arima.py
+++ b/statsmodels/tsa/tests/test_arima.py
@@ -609,7 +609,7 @@ def setup_class(cls):
         cls.res1.forecast_err,
         conf_int) = cls.res1.forecast(25)
        #cls.res1.forecast_res_dyn = cls.res1.predict(start=164, end=226, typ='levels', dynamic=True)
-        #TODO: fix the indexing for the end here, I don't think this is right
+        #TODO: fix the indexing for the end here, I do not think this is right
         # if we're going to treat it like indexing
         # the forecast from 2005Q1 through 2009Q4 is indices
         # 184 through 227 not 226
@@ -672,7 +672,7 @@ def setup_class(cls):
        # cls.res1.forecast_err,
        # conf_int) = cls.res1.forecast(25)
        #cls.res1.forecast_res_dyn = cls.res1.predict(start=164, end=226, typ='levels', dynamic=True)
-        #TODO: fix the indexing for the end here, I don't think this is right
+        #TODO: fix the indexing for the end here, I do not think this is right
         # if we're going to treat it like indexing
         # the forecast from 2005Q1 through 2009Q4 is indices
         # 184 through 227 not 226
@@ -680,7 +680,7 @@ def setup_class(cls):
        # predictions
        #cls.res1.forecast_res_dyn = self.predict(start=164, end=164+63,
        #                                         typ='levels', dynamic=True)
-        # since we got from gretl don't have linear prediction in differences
+        # since we got from gretl do not have linear prediction in differences
         cls.decimal_arroots = 3
         cls.decimal_maroots = 2
         cls.decimal_t = 1
@@ -873,7 +873,7 @@ def test_arima_predict_mle():
     fv = res1.predict(start, end, typ='levels')
     assert_almost_equal(fv, fc[start:end+1], DECIMAL_3)
     # start >nobs, end >nobs 2009q4 - 2015q4
-    #NOTE: this raises but shouldn't, dynamic forecasts could start
+    #NOTE: this raises but should not, dynamic forecasts could start
     #one period out
     start, end = 203, 227
     fv = res1.predict(start, end, typ='levels')
@@ -1627,7 +1627,7 @@ def test_1dexog():

 def test_arima_predict_bug():
-    #predict_start_date wasn't getting set on start = None
+    #predict_start_date was not getting set on start = None
     from statsmodels.datasets import sunspots
     dta = sunspots.load_pandas().data.SUNACTIVITY
     dta.index = pd.date_range(start='1700', end='2009', freq='A')[:309]
@@ -1828,7 +1828,7 @@ def test_arimax():
     X.iloc[0] = 0
     res = ARIMA(y, (2, 1, 1), X).fit(disp=False)
-    # gretl won't estimate this - looks like maybe a bug on their part,
+    # gretl will not estimate this - looks like maybe a bug on their part,
     # but we can just fine, we're close to Stata's answer
     # from Stata
     params = [19.5656863783347, 0.32653841355833396198,
@@ -2304,7 +2304,7 @@ def test_arima_exog_predict():
     mod_002 = ARIMA(np.asarray(data_sample['loginv']), (0, 0, 2),
                     exog=np.asarray(data_sample[['loggdp', 'logcons']]))
-    # doesn't converge with default starting values
+    # does not converge with default starting values
     start_params = np.concatenate((res.params[[0, 1, 2, 4]], [0]))
     res_002 = mod_002.fit(start_params=start_params, disp=0, solver='bfgs', maxiter=5000)
@@ -2350,7 +2350,7 @@ def test_arima_fit_multiple_calls():
     mod.fit(disp=0, start_params=[np.mean(y), .1, .1, .1])
     assert_equal(mod.exog_names, ['const', 'ar.L1.y', 'ma.L1.y', 'ma.L2.y'])
-    mod.exog = None  # FIXME: this shouldn't be necessary
+    mod.exog = None  # FIXME: this should not be necessary
     with pytest.warns(HessianInversionWarning, match="no bse or cov"):
         res = mod.fit(disp=0, start_params=[np.mean(y), .1, .1, .1])
     assert_equal(mod.exog_names, ['const', 'ar.L1.y', 'ma.L1.y', 'ma.L2.y'])
@@ -2364,7 +2364,7 @@ def test_arima_fit_multiple_calls():
     mod.fit(disp=0, start_params=[np.mean(y)])
     assert_equal(mod.exog_names, ['const'])
-    mod.exog = None  # FIXME: this shouldn't be necessary
+    mod.exog = None  # FIXME: this should not be necessary
     res = mod.fit(disp=0, start_params=[np.mean(y)])
     assert_equal(mod.exog_names, ['const'])
diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py
index 01439e37f09..b77762b1c1e 100644
--- a/statsmodels/tsa/tests/test_stattools.py
+++ b/statsmodels/tsa/tests/test_stattools.py
@@ -88,7 +88,7 @@ def setup_class(cls):
         cls.critvalues = [-4.007, -3.437, -3.137]

-# FIXME: don't leave commented-out
+# FIXME: do not leave commented-out
 #class TestADFConstantTrendSquared(CheckADF):
 #    """
 #    """
@@ -192,7 +192,7 @@ def test_qstat(self):
     # FIXME: enable/xfail/skip or delete
     #def pvalue(self):
     #    pass
-    # NOTE: shouldn't need testing if Q stat is correct
+    # NOTE: should not need testing if Q stat is correct

 class TestACF_FFT(CheckCorrGram):
@@ -315,7 +315,7 @@ def test_tstat(self):
         assert_almost_equal(self.coint_t,self.teststat, DECIMAL_4)

-# this doesn't produce the old results anymore
+# this does not produce the old results anymore
 class TestCoint_t(CheckCoint):
     """
     Get AR(1) parameter on residuals
diff --git a/statsmodels/tsa/tsatools.py b/statsmodels/tsa/tsatools.py
index 92a6965680c..18b5a825898 100644
--- a/statsmodels/tsa/tsatools.py
+++ b/statsmodels/tsa/tsatools.py
@@ -215,7 +215,7 @@ def add_lag(x, col=None, lags=1, drop=False, insert=True):
         else:
             last_names.pop(last_names.index(col))

-        if first_names:  # only do this if x isn't "empty"
+        if first_names:  # only do this if x is not "empty"
             # Workaround to avoid NumPy FutureWarning
             _x = recarray_select(x, first_names)
             first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T,
diff --git a/statsmodels/tsa/varma_process.py b/statsmodels/tsa/varma_process.py
index 4fe606cc5d0..553725f56dd 100644
--- a/statsmodels/tsa/varma_process.py
+++ b/statsmodels/tsa/varma_process.py
@@ -5,7 +5,7 @@
 Author: josef-pktd
 License: BSD

-This is a new version, I didn't look at the old version again, but similar
+This is a new version, I did not look at the old version again, but similar
 ideas.
 not copied/cleaned yet:
@@ -37,8 +37,8 @@ def varfilter(x, a):
     '''apply an autoregressive filter to a series x

-    Warning: I just found out that convolve doesn't work as I
-    thought, this likely doesn't work correctly for
+    Warning: I just found out that convolve does not work as I
+    thought, this likely does not work correctly for
     nvars>3
diff --git a/statsmodels/tsa/vector_ar/svar_model.py b/statsmodels/tsa/vector_ar/svar_model.py
index 824c70f54ea..c51928bc8e4 100644
--- a/statsmodels/tsa/vector_ar/svar_model.py
+++ b/statsmodels/tsa/vector_ar/svar_model.py
@@ -274,7 +274,7 @@ def loglike(self, params):
         is estimated
         """
-        #TODO: this doesn't look robust if A or B is None
+        #TODO: this does not look robust if A or B is None
         A = self.A
         B = self.B
         A_mask = self.A_mask
@@ -600,7 +600,7 @@ def __init__(self, endog, endog_lagged, params, sigma_u, lag_order,
         coefs = reshaped.swapaxes(1, 2).copy()

         #SVAR components
-        #TODO: if you define these here, you don't also have to define
+        #TODO: if you define these here, you do not also have to define
         #them in SVAR process, but I left them for now -ss
         self.A = A
         self.B = B
diff --git a/statsmodels/tsa/vector_ar/tests/results/results_var.py b/statsmodels/tsa/vector_ar/tests/results/results_var.py
index 3eda619b21d..99e4838e8ac 100644
--- a/statsmodels/tsa/vector_ar/tests/results/results_var.py
+++ b/statsmodels/tsa/vector_ar/tests/results/results_var.py
@@ -50,7 +50,7 @@ def __init__(self):
         self.llf = 1962.572126661708

         self.chi2_1 = 75.44775165699033
-        # don't know how they calculate this; it's not -2 * (ll1 - ll0)
+        # do not know how they calculate this; it's not -2 * (ll1 - ll0)
         self.chi2_2 = 33.19878716815366
         self.chi2_3 = 83.90568280242312
diff --git a/statsmodels/tsa/vector_ar/tests/test_var.py b/statsmodels/tsa/vector_ar/tests/test_var.py
index 82ad12eca11..2c3674231a0 100644
--- a/statsmodels/tsa/vector_ar/tests/test_var.py
+++ b/statsmodels/tsa/vector_ar/tests/test_var.py
@@ -53,7 +53,7 @@ def bivariate_var_result(bivariate_var_data):

 class CheckVAR(object):
     # FIXME: not inherited, so these tests are never run!
-    # just so pylint won't complain
+    # just so pylint will not complain
     res1 = None
     res2 = None
diff --git a/statsmodels/tsa/vector_ar/tests/test_var_jmulti.py b/statsmodels/tsa/vector_ar/tests/test_var_jmulti.py
index 76be37335a5..5dcf85ed932 100644
--- a/statsmodels/tsa/vector_ar/tests/test_var_jmulti.py
+++ b/statsmodels/tsa/vector_ar/tests/test_var_jmulti.py
@@ -190,7 +190,7 @@ def test_ols_det_terms():
         err_msg = build_err_msg(ds, dt_s, "PARAMETER MATRICES EXOG")
         det_key_ref = "Deterministic term"
-        # If there are no det. terms, just make sure we don't compute any:
+        # If there are no det. terms, just make sure we do not compute any:
         if det_key_ref not in results_ref[ds][dt_s]["est"].keys():
             assert_((results_sm[ds][dt_s].coefs_exog.size == 0 and
                      results_sm[ds][dt_s].stderr_dt.size == 0 and
@@ -341,7 +341,7 @@ def test_causality():  # test Granger- and instantaneous causality
         g_t_obt_str = granger_sm_str.test_statistic
         assert_allclose(g_t_obt_str, g_t_obt, 1e-07, 0, False,
                         err_msg_g_t + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1 or len(caused_ind) == 1:
@@ -352,7 +352,7 @@ def test_causality():  # test Granger- and instantaneous causality
             g_t_obt_single = granger_sm_single_ind.test_statistic
             assert_allclose(g_t_obt_single, g_t_obt, 1e-07, 0, False,
                             err_msg_g_t + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

         # test p-value for Granger non-causality:
         g_p_obt = granger_sm_ind.pvalue
@@ -363,14 +363,14 @@ def test_causality():  # test Granger- and instantaneous causality
         g_p_obt_str = granger_sm_str.pvalue
         assert_allclose(g_p_obt_str, g_p_obt, 1e-07, 0, False,
                         err_msg_g_t + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
             g_p_obt_single = granger_sm_single_ind.pvalue
             assert_allclose(g_p_obt_single, g_p_obt, 1e-07, 0, False,
                             err_msg_g_t + " - list of int and int as ".upper() + \
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

         # test instantaneous causality ################################
         inst_sm_ind = results_sm[ds][dt].test_inst_causality(
@@ -386,7 +386,7 @@ def test_causality():  # test Granger- and instantaneous causality
         t_obt_str = inst_sm_str.test_statistic
         assert_allclose(t_obt_str, t_obt, 1e-07, 0, False,
                         err_msg_i_t + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
@@ -395,7 +395,7 @@ def test_causality():  # test Granger- and instantaneous causality
             t_obt_single = inst_sm_single_ind.test_statistic
             assert_allclose(t_obt_single, t_obt, 1e-07, 0, False,
                             err_msg_i_t + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

         # test p-value for instantaneous non-causality
         p_obt = results_sm[ds][dt].test_inst_causality(
@@ -407,7 +407,7 @@ def test_causality():  # test Granger- and instantaneous causality
         p_obt_str = inst_sm_str.pvalue
         assert_allclose(p_obt_str, p_obt, 1e-07, 0, False,
                         err_msg_i_p + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
@@ -416,7 +416,7 @@ def test_causality():  # test Granger- and instantaneous causality
             p_obt_single = inst_sm_single_ind.pvalue
             assert_allclose(p_obt_single, p_obt, 1e-07, 0, False,
                             err_msg_i_p + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

 def test_impulse_response():
@@ -481,7 +481,7 @@ def test_normality():
         obt_pvalue = obtained.pvalue
         des_pvalue = results_ref[ds][dt]["test_norm"]["joint_pvalue"]
         assert_allclose(obt_pvalue, des_pvalue, rtol, atol, False, err_msg)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         obtained.summary()
         str(obtained)  # __str__()
diff --git a/statsmodels/tsa/vector_ar/tests/test_vecm.py b/statsmodels/tsa/vector_ar/tests/test_vecm.py
index 281317a2d16..a0e1647ccbf 100644
--- a/statsmodels/tsa/vector_ar/tests/test_vecm.py
+++ b/statsmodels/tsa/vector_ar/tests/test_vecm.py
@@ -909,7 +909,7 @@ def test_granger_causality():
         granger_sm_str = results_sm[ds][
             dt].test_granger_causality(caused_names, causing_names)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         granger_sm_ind.summary()
         str(granger_sm_ind)  # __str__()
         assert_(granger_sm_ind == granger_sm_str)  # __eq__()
@@ -929,7 +929,7 @@ def test_granger_causality():
         g_t_obt_str = granger_sm_str.test_statistic
         assert_allclose(g_t_obt_str, g_t_obt, 1e-07, 0, False,
                         err_msg_g_t + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1 or len(caused_ind) == 1:
@@ -940,7 +940,7 @@ def test_granger_causality():
             g_t_obt_single = granger_sm_single_ind.test_statistic
             assert_allclose(g_t_obt_single, g_t_obt, 1e-07, 0, False,
                             err_msg_g_t + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

         # test p-value for Granger non-causality:
         g_p_obt = granger_sm_ind.pvalue
@@ -951,14 +951,14 @@ def test_granger_causality():
         g_p_obt_str = granger_sm_str.pvalue
         assert_allclose(g_p_obt_str, g_p_obt, 1e-07, 0, False,
                         err_msg_g_t + " - sequences of integers and ".upper() + \
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
             g_p_obt_single = granger_sm_single_ind.pvalue
             assert_allclose(g_p_obt_single, g_p_obt, 1e-07, 0, False,
                             err_msg_g_t + " - list of int and int as ".upper() + \
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

 def test_inst_causality():  # test instantaneous causality
@@ -993,7 +993,7 @@ def test_inst_causality():  # test instantaneous causality
             dt].test_inst_causality(causing_ind)
         inst_sm_str = results_sm[ds][dt].test_inst_causality(
             causing_names)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         inst_sm_ind.summary()
         str(inst_sm_ind)  # __str__()
         assert_(inst_sm_ind == inst_sm_str)  # __eq__()
@@ -1013,7 +1013,7 @@ def test_inst_causality():  # test instantaneous causality
         t_obt_str = inst_sm_str.test_statistic
         assert_allclose(t_obt_str, t_obt, 1e-07, 0, False,
                         err_msg_i_t + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
@@ -1022,7 +1022,7 @@ def test_inst_causality():  # test instantaneous causality
             t_obt_single = inst_sm_single_ind.test_statistic
             assert_allclose(t_obt_single, t_obt, 1e-07, 0, False,
                             err_msg_i_t + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

         # test p-value for instantaneous non-causality
         p_obt = results_sm[ds][dt].test_inst_causality(
@@ -1034,7 +1034,7 @@ def test_inst_causality():  # test instantaneous causality
         p_obt_str = inst_sm_str.pvalue
         assert_allclose(p_obt_str, p_obt, 1e-07, 0, False,
                         err_msg_i_p + " - sequences of integers and ".upper() +
-                        "strings as arguments don't yield the same result!".upper())
+                        "strings as arguments do not yield the same result!".upper())
         # check if int (e.g. 0) as index and list of int ([0]) yield
         # the same result:
         if len(causing_ind) == 1:
@@ -1043,7 +1043,7 @@ def test_inst_causality():  # test instantaneous causality
             p_obt_single = inst_sm_single_ind.pvalue
             assert_allclose(p_obt_single, p_obt, 1e-07, 0, False,
                             err_msg_i_p + " - list of int and int as ".upper() +
-                            "argument don't yield the same result!".upper())
+                            "argument do not yield the same result!".upper())

 def test_impulse_response():
@@ -1127,7 +1127,7 @@ def test_lag_order_selection():
             if exog_coint:
                 assert_equal(getattr(obtained_all_exog_coint, ic),
                              getattr(obtained_all, ic), "WITH EXOG_COINT" + err_msg)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         obtained_all.summary()
         str(obtained_all)  # __str__()
@@ -1165,7 +1165,7 @@ def test_normality():
         obt_pvalue = obtained.pvalue
         des_pvalue = results_ref[ds][dt]["test_norm"]["joint_pvalue"]
         assert_allclose(obt_pvalue, des_pvalue, rtol, atol, False, err_msg)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         obtained.summary()
         str(obtained)  # __str__()
         assert_(obtained == obtained_exog)  # __eq__()
@@ -1229,7 +1229,7 @@ def test_whiteness():
                                     "P-VALUE (ADJUSTED TEST)")
             desired = results_ref[ds][dt]["whiteness"]["p-value adjusted"]
             assert_allclose(obtained.pvalue, desired, rtol, atol, False, err_msg)
-        # call methods to assure they don't raise exceptions
+        # call methods to assure they do not raise exceptions
         obtained.summary()
         str(obtained)  # __str__()
         assert_(obtained == obtained_exog)  # __eq__()
diff --git a/statsmodels/tsa/vector_ar/var_model.py b/statsmodels/tsa/vector_ar/var_model.py
index f4aad746257..12305644f1e 100644
--- a/statsmodels/tsa/vector_ar/var_model.py
+++ b/statsmodels/tsa/vector_ar/var_model.py
@@ -655,7 +655,7 @@ def _estimate_var(self, lags, offset=0, trend='c'):
         trend : string or None
             As per above
         """
-        # have to do this again because select_order doesn't call fit
+        # have to do this again because select_order does not call fit
         self.k_trend = k_trend = util.get_trendorder(trend)

         if offset < 0:  # pragma: no cover
diff --git a/statsmodels/tsa/vector_ar/vecm.py b/statsmodels/tsa/vector_ar/vecm.py
index 780312fb075..402555b24a2 100644
--- a/statsmodels/tsa/vector_ar/vecm.py
+++ b/statsmodels/tsa/vector_ar/vecm.py
@@ -188,7 +188,7 @@ def _deterministic_to_exog(deterministic, seasons, nobs_tot, first_season=0,
     Returns
     -------
     exog : ndarray or None
-        None, if the function's arguments don't contain deterministic terms.
+        None, if the function's arguments do not contain deterministic terms.
         Otherwise, an ndarray representing these deterministic terms.
     """
     exogs = []
@@ -793,7 +793,7 @@ class VECM(tsbase.TimeSeriesModel):
         Combinations of these are possible (e.g. ``"cili"`` or ``"colo"`` for
         linear trend with intercept). When using a constant term you have to
         choose whether you want to restrict it to the cointegration relation
-        (i.e. ``"ci"``) or leave it unrestricted (i.e. ``"co"``). Don't use
+        (i.e. ``"ci"``) or leave it unrestricted (i.e. ``"co"``). Do not use
         both ``"ci"`` and ``"co"``. The same applies for ``"li"`` and ``"lo"``
         when using a linear term. See the Notes-section for more information.
     seasons : int, default: 0
@@ -1658,7 +1658,7 @@ def predict(self, steps=5, alpha=None, exog_fc=None, exog_coint_fc=None):
                              "argument!")
         if self.exog is None and exog_fc is not None:
             raise ValueError("This VECMResult-instance's exog attribute is "
-                             "None. Please don't pass a non-None value as the "
+                             "None. Please do not pass a non-None value as the "
                              "method's exog_fc-argument.")
         if exog_fc is not None and exog_fc.shape[0] < steps:
             raise ValueError("The argument exog_fc must have at least steps "
@@ -1670,7 +1670,7 @@ def predict(self, steps=5, alpha=None, exog_fc=None, exog_coint_fc=None):
                              "exog_coint_fc argument!")
         if self.exog_coint is None and exog_coint_fc is not None:
             raise ValueError("This VECMResult-instance's exog_coint attribute "
-                             "is None. Please don't pass a non-None value as "
+                             "is None. Please do not pass a non-None value as "
                              "the method's exog_coint_fc-argument.")
         if exog_coint_fc is not None and exog_coint_fc.shape[0] < steps - 1:
             raise ValueError("The argument exog_coint_fc must have at least "
diff --git a/statsmodels/tsa/x13.py b/statsmodels/tsa/x13.py
index 6dcfec6d329..79177742136 100644
--- a/statsmodels/tsa/x13.py
+++ b/statsmodels/tsa/x13.py
@@ -471,7 +471,7 @@ def x13_arima_analysis(endog, maxorder=(2, 1), maxdiff=(2, 1), diff=None,
         trend = _convert_out_to_series(trend, endog.index, 'trend')
         irregular = _convert_out_to_series(irregular, endog.index, 'irregular')

-    # NOTE: there isn't likely anything in stdout that's not in results
+    # NOTE: there is not likely anything in stdout that's not in results
     # so may be safe to just suppress and remove it
     if not retspec:
         res = X13ArimaAnalysisResult(observed=endog, results=results,
diff --git a/tools/backport_pr.py b/tools/backport_pr.py
index d0d2be6da3b..b18d2b2697f 100755
--- a/tools/backport_pr.py
+++ b/tools/backport_pr.py
@@ -163,7 +163,7 @@ def should_backport(labels=None, milestone=None):
     if len(sys.argv) < 3:
         branch = sys.argv[1]
         already = already_backported(branch)
-        # NOTE: change this to the label you've used for marking a backport
+        # NOTE: change this to the label you have used for marking a backport
         should = should_backport(milestone="0.5.1")
         print("The following PRs should be backported:")
         to_backport = []
diff --git a/tools/ci/azure_template.yml b/tools/ci/azure_template.yml
index bd7e6b297c9..903e278cf8c 100644
--- a/tools/ci/azure_template.yml
+++ b/tools/ci/azure_template.yml
@@ -4,7 +4,7 @@
 # https://docs.microsoft.com/azure/devops/pipelines/languages/python

 parameters:
-  # defaults for any parameters that aren't specified
+  # defaults for any parameters that are not specified
   name: ''
   vmImage: ''
diff --git a/tools/estmat2nparray.ado b/tools/estmat2nparray.ado
index 29521b838d6..6eab3272255 100644
--- a/tools/estmat2nparray.ado
+++ b/tools/estmat2nparray.ado
@@ -12,7 +12,7 @@
 * add estimation results from e(), e(scalars) and e(macros), not the matrices in e
 * make estimation result optional
 * add aliases for params_table
-* don't split col or row names if only 1 - changed my mind: always list
+* do not split col or row names if only 1 - changed my mind: always list

 * Issues
 * ------
diff --git a/tools/generate_formula_api.py b/tools/generate_formula_api.py
index 244d90991a6..a4f9b3415d6 100755
--- a/tools/generate_formula_api.py
+++ b/tools/generate_formula_api.py
@@ -35,7 +35,7 @@ def iter_subclasses(cls, _seen=None, template_classes=[]):
     for sub in subs:
         if sub not in _seen and sub.__name__ not in template_classes:
             _seen.add(sub)
-            # we don't want to yield the templates, but we do want to
+            # we do not want to yield the templates, but we do want to
             # recurse on them
             yield sub
             for sub in iter_subclasses(sub, _seen, template_classes):
@@ -49,7 +49,7 @@ def write_formula_api(directory):
                         'TimeSeriesModel',
                         # this class should really be deleted
                         'ARIMAProcess',
-                        # these need some more work, so don't expose them
+                        # these need some more work, so do not expose them
                         'ARIMA', 'VAR', 'SVAR', 'AR', 'NBin', 'NbReg',
                         'ARMA',
                         ]
diff --git a/tools/km_cox1.do b/tools/km_cox1.do
index 17c58c2bdf6..dba4ef04b80 100644
--- a/tools/km_cox1.do
+++ b/tools/km_cox1.do
@@ -30,7 +30,7 @@ ereturn list,
 matlist e(V)
 matlist e(p)

-* the next doesn't work
+* the next does not work
 * predict predictall, hr xb stdp basesurv basechazard basehc mgale csnell deviance ldisplace lmax effects
 /* generate in python:
 >>> for i in 'hr xb stdp basesurv basechazard basehc mgale csnell deviance ldisplace lmax effects'.split(): print 'predict %s, %s' % (i,i)
diff --git a/tools/matplotlibrc.qt4 b/tools/matplotlibrc.qt4
index 83783d977bb..a33c54b72a4 100644
--- a/tools/matplotlibrc.qt4
+++ b/tools/matplotlibrc.qt4
@@ -150,7 +150,7 @@ font.monospace : Andale Mono, Nimbus Mono L, Courier New, Courier, Fixed,
 # type1cm, textcomp. Adobe Postscript (PSSNFS) font packages
 # may also be loaded, depending on your font settings

-#text.dvipnghack : None      # some versions of dvipng don't handle alpha
+#text.dvipnghack : None      # some versions of dvipng do not handle alpha
                              # channel properly.  Use True to correct
                              # and flush ~/.matplotlib/tex.cache
                              # before testing and False to force
@@ -359,7 +359,7 @@ figure.subplot.hspace : 0.5  # the amount of height reserved for white space

 # Event keys to interact with figures/plots via keyboard.
 # Customize these settings according to your needs.
-# Leave the field(s) empty if you don't need a key-map. (i.e., fullscreen : '')
+# Leave the field(s) empty if you do not need a key-map. (i.e., fullscreen : '')

 keymap.fullscreen : f        # toggling
 keymap.home : h, r, home     # home or reset mnemonic
diff --git a/tools/matplotlibrc.qt5 b/tools/matplotlibrc.qt5
index 0e7d2a2c8f1..a22bed65612 100644
--- a/tools/matplotlibrc.qt5
+++ b/tools/matplotlibrc.qt5
@@ -186,7 +186,7 @@ backend.qt5 : PyQt5
 # type1cm, textcomp. Adobe Postscript (PSSNFS) font packages
 # may also be loaded, depending on your font settings

-#text.dvipnghack : None      # some versions of dvipng don't handle alpha
+#text.dvipnghack : None      # some versions of dvipng do not handle alpha
                              # channel properly.  Use True to correct
                              # and flush ~/.matplotlib/tex.cache
                              # before testing and False to force
@@ -468,7 +468,7 @@ backend.qt5 : PyQt5

 # Event keys to interact with figures/plots via keyboard.
 # Customize these settings according to your needs.
-# Leave the field(s) empty if you don't need a key-map. (i.e., fullscreen : '')
+# Leave the field(s) empty if you do not need a key-map. (i.e., fullscreen : '')

 #keymap.fullscreen : f       # toggling
 #keymap.home : h, r, home    # home or reset mnemonic
diff --git a/tools/update_web.py b/tools/update_web.py
index b0766830beb..1482841b6c9 100755
--- a/tools/update_web.py
+++ b/tools/update_web.py
@@ -88,7 +88,7 @@

 # ------------ FUNCTIONS ------------
 def create_virtualenv():
-    # make a virtualenv for installation if it doesn't exist
+    # make a virtualenv for installation if it does not exist
     # and easy_install sphinx
     if not os.path.exists(virtual_dir):
         retcode = subprocess.call(['/home/skipper/.local/bin/virtualenv',
@@ -105,7 +105,7 @@ def create_virtualenv():

 def create_update_gitdir():
     """
-    Creates a directory for local repo if it doesn't exist,
+    Creates a directory for local repo if it does not exist,
     updates repo otherwise.
     """
     if not os.path.exists(gitdname):
@@ -115,7 +115,7 @@ def create_update_gitdir():
             msg = """There was a problem cloning the repo"""
             raise Exception(msg)
     else:
-        # directory exists, can't pull if you're not on a branch
+        # directory exists, cannot pull if you're not on a branch
         # just delete it and clone again. Lazy but clean solution.
shutil.rmtree(gitdname) create_update_gitdir() @@ -172,7 +172,7 @@ def newdir(dirs): os.path.isfile(os.path.join(dname, i))]) newdir = newdirs.difference(dirs) if len(newdir) != 1: - msg = ("There was more than one directory created. Don't know what " + msg = ("There was more than one directory created. Do not know what " "to delete.") raise Exception(msg) newdir = newdir.pop() @@ -359,7 +359,7 @@ def main(): except Exception: msg += traceback.format_exc() - if msg == '': # if it doesn't something went wrong and was caught above + if msg == '': # if it does not something went wrong and was caught above email_me() else: email_me(msg) diff --git a/versioneer.py b/versioneer.py index 64fea1c8927..135278ef71e 100644 --- a/versioneer.py +++ b/versioneer.py @@ -318,7 +318,7 @@ def get_root(): # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use + # module-import table will cache the first one. So we cannot use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. me = os.path.realpath(os.path.abspath(__file__)) @@ -546,14 +546,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + raise NotThisMethod("rootdir does not start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, + # keywords. 
When used from setup.py, we do not want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} @@ -638,7 +638,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short + expanded, and _version.py has not already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] @@ -653,7 +653,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) + # if there is not one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%%s*" %% tag_prefix], @@ -697,9 +697,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" + fmt = "tag '%%s' does not start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + pieces["error"] = ("tag '%%s' does not start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] @@ -726,7 +726,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" + """Return a + if we do not already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." 
return "+" @@ -778,7 +778,7 @@ def render_pep440_post(pieces): The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. + but you should not be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] @@ -898,7 +898,7 @@ def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # py2exe/bbfreeze/non-CPython implementations do not do __file__, in which # case we can only use expanded keywords. cfg = get_config() @@ -945,7 +945,7 @@ def get_versions(): def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, + # keywords. When used from setup.py, we do not want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} @@ -1030,7 +1030,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short + expanded, and _version.py has not already been rewritten with a short version string, meaning we're inside a checked out source tree. 
""" GITS = ["git"] @@ -1045,7 +1045,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) + # if there is not one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix], @@ -1089,9 +1089,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" + fmt = "tag '%s' does not start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + pieces["error"] = ("tag '%s' does not start with prefix '%s'" % (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] @@ -1177,7 +1177,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if verbose: print("Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + raise NotThisMethod("rootdir does not start with parentdir_prefix") SHORT_VERSION_PY = """ @@ -1227,7 +1227,7 @@ def write_to_version_file(filename, versions): def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" + """Return a + if we do not already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" @@ -1279,7 +1279,7 @@ def render_pep440_post(pieces): The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. + but you should not be releasing software with -dirty anyways. Exceptions: 1: no tags. 
0.postDISTANCE[.dev0] @@ -1534,7 +1534,7 @@ def run(self): # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc - # if .git isn't copied too, 'git describe' will fail + # if .git is not copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? @@ -1561,7 +1561,7 @@ def run(self): if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe - # nczeczulin reports that py2exe won't like the pep440-style string + # nczeczulin reports that py2exe will not like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION @@ -1734,12 +1734,12 @@ def do_setup(): else: print(" %s unmodified" % ipy) else: - print(" %s doesn't exist, ok" % ipy) + print(" %s does not exist, ok" % ipy) ipy = None # Make sure both the top-level "versioneer.py" and versionfile_source # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to + # they'll be copied into source distributions. Pip will not be able to # install the package without this. manifest_in = os.path.join(root, "MANIFEST.in") simple_includes = set() @@ -1751,7 +1751,7 @@ def do_setup(): simple_includes.add(include) except EnvironmentError: pass - # That doesn't cover everything MANIFEST.in can do + # That does not cover everything MANIFEST.in can do # (http://docs.python.org/2/distutils/sourcedist.html#commands), so # it might give some false negatives. Appending redundant 'include' # lines is safe, though.
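The patch above applies one mechanical transformation repository-wide: each English contraction in comments, docstrings, and messages is expanded ("don't" → "do not", "won't" → "will not", and so on), preserving capitalization. How the patch was actually generated is not stated; as an illustration only, a sketch of that transformation might look like this (the mapping table and helper name are assumptions, not part of the patch):

```python
import re

# Illustrative subset of the contractions expanded in the diff above.
# This table and the function below are a hypothetical reconstruction,
# not the tool the patch authors actually used.
CONTRACTIONS = {
    "don't": "do not",
    "doesn't": "does not",
    "isn't": "is not",
    "aren't": "are not",
    "won't": "will not",
    "can't": "cannot",
    "hasn't": "has not",
    "shouldn't": "should not",
    "you've": "you have",
}

# Try longer contractions first so e.g. "doesn't" is never partially
# matched by a shorter alternative.
_PATTERN = re.compile(
    "|".join(re.escape(c) for c in sorted(CONTRACTIONS, key=len, reverse=True)),
    re.IGNORECASE,
)


def expand_contractions(text):
    """Expand known contractions, preserving a leading capital letter."""
    def repl(match):
        word = match.group(0)
        expansion = CONTRACTIONS[word.lower()]
        if word[0].isupper():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    return _PATTERN.sub(repl, text)
```

Applied line by line to source files, this reproduces the kind of rewrites seen in the hunks, e.g. `expand_contractions("py2exe won't like the pep440-style string")` yields the replacement text shown in the versioneer.py hunk. Note that a purely textual pass like this cannot distinguish comments from string literals whose runtime value matters, which is presumably why such changes still need review.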