diff --git a/.coveragerc b/.coveragerc index 9936b513a6..05b5fdb6ca 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,6 +5,7 @@ branch = True omit = */results/* */_version.py + */conftest.py [report] # Regexes for lines to exclude from consideration diff --git a/doc/source/changes.rst b/doc/source/changes.rst index e1fb6208d0..3c474c34d3 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -1,13 +1,15 @@ Change Log ---------- -Since 4.14 -========== -* Removed support for Python 3.5 inline with NEP-29 (:issue:`222`) +Version 4.15 +============ +* Blackened the code. +* Added McElroy's and Berndt's measures of system fit (:issue:`215`). +* Removed support for Python 3.5 in line with NEP-29 (:issue:`222`). Version 4.14 ============ -* Fixed issue where datasets were not installed with wheels (:issue:`217`) +* Fixed issue where datasets were not installed with wheels (:issue:`217`). * Switched to property-cached to inherit cached property from property (:issue:`211`). * Removed all use of :class:`pandas.Panel` (:issue:`211`). diff --git a/doc/source/conf.py b/doc/source/conf.py index 3e5f77a11f..7946bd5feb 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -21,6 +21,9 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) +import glob +import os +import hashlib from distutils.version import LooseVersion import linearmodels @@ -38,54 +41,78 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'numpydoc', - 'sphinx_autodoc_typehints', - 'sphinx.ext.autosummary', - 'sphinx.ext.extlinks', - 'sphinx.ext.doctest', - 'IPython.sphinxext.ipython_console_highlighting', - 'IPython.sphinxext.ipython_directive', - 'nbsphinx', - 'sphinx_material' - ] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "numpydoc", + "sphinx_autodoc_typehints", + "sphinx.ext.autosummary", + "sphinx.ext.extlinks", + "sphinx.ext.doctest", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "nbsphinx", + "sphinx_material", +] try: import sphinxcontrib.spelling # noqa: F401 except ImportError as err: # noqa: F841 pass else: - extensions.append('sphinxcontrib.spelling') + extensions.append("sphinxcontrib.spelling") -spelling_word_list_filename = ['spelling_wordlist.txt', 'names_wordlist.txt'] +spelling_word_list_filename = ["spelling_wordlist.txt", "names_wordlist.txt"] spelling_ignore_pypi_package_names = True add_module_names = False +# Copy over notebooks from examples to docs for build +files = glob.glob("../../examples/*.ipynb") + glob.glob("../../examples/*.png") +for file_to_copy in files: + full_name = os.path.split(file_to_copy)[-1] + folder, file_name = full_name.split("_") + if not file_name.endswith("ipynb"): + file_name = "_".join((folder, file_name)) + out_dir = os.path.join(folder, "examples") + if not os.path.exists(out_dir): + os.makedirs(out_dir, exist_ok=True) + out_file = os.path.join(out_dir, file_name) + existing_hash = "" + with open(file_to_copy, "rb") as example: + example_file = example.read() + example_hash =
hashlib.md5(example_file).hexdigest() + if os.path.exists(out_file): + with open(out_file, "rb") as existing: + existing_hash = hashlib.md5(existing.read()).hexdigest() + if existing_hash != example_hash: + print(f"Copying {file_to_copy} to {out_file}") + with open(out_file, "wb") as out: + out.write(example_file) + # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'linearmodels' -copyright = '2017, Kevin Sheppard' -author = 'Kevin Sheppard' +project = "linearmodels" +copyright = "2017, Kevin Sheppard" +author = "Kevin Sheppard" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -94,14 +121,14 @@ # The short X.Y version. # The short X.Y version. version = LooseVersion(linearmodels.__version__) -if '+' in version.version: +if "+" in version.version: version = linearmodels.__version__ - version = version.replace('.dirty', '') - version = version.split('+') - commits, tag = version[1].split('.') + version = version.replace(".dirty", "") + version = version.split("+") + commits, tag = version[1].split(".") version = version[0] - short_tag = ' (+{0})'.format(commits) - tag = ' (+' + commits + ', ' + tag + ')' + short_tag = " (+{0})".format(commits) + tag = " (+" + commits + ", " + tag + ")" short_version = version + short_tag version = version + tag else: @@ -137,40 +164,40 @@ html_theme_path = sphinx_material.html_theme_path() html_context = sphinx_material.get_html_context() -html_theme = 'sphinx_material' +html_theme = "sphinx_material" # Adds an HTML table visitor to apply Bootstrap table classes # sphinx_material theme options (see theme.conf for more information) html_theme_options = { - 'base_url': 'http://bashtage.github.io/linearmodels/', - 'repo_url': 'https://github.com/bashtage/linearmodels/', - 'repo_name': 'linearmodels', + "base_url": "http://bashtage.github.io/linearmodels/", + "repo_url": "https://github.com/bashtage/linearmodels/", + "repo_name": "linearmodels", # Set the name of the project to appear in the sidebar "nav_title": project + " " + short_version, - 'globaltoc_depth': 2, - 'globaltoc_collapse': True, - 'globaltoc_includehidden': True, - 'theme_color': '#2196f3', - 'color_primary': 'blue', - 'color_accent': 'orange', - 'html_minify': True, - 'css_minify': True, - 'master_doc': False, - 'heroes': { - 'index': 'Models for panel data, system regression, instrumental \ - variables and asset pricing.' - } + "globaltoc_depth": 2, + "globaltoc_collapse": True, + "globaltoc_includehidden": True, + "theme_color": "#2196f3", + "color_primary": "blue", + "color_accent": "orange", + "html_minify": True, + "css_minify": True, + "master_doc": False, + "heroes": { + "index": "Models for panel data, system regression, instrumental \ + variables and asset pricing." + }, } -html_favicon = 'images/favicon.ico' -html_logo = 'images/bw-logo.svg' +html_favicon = "images/favicon.ico" +html_logo = "images/bw-logo.svg" # Register the theme as an extension to generate a sitemap.xml # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] html_sidebars = { "**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] @@ -183,15 +210,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -201,18 +225,20 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'linearmodels.tex', 'linearmodels Documentation', - 'Kevin Sheppard', 'manual'), + ( + master_doc, + "linearmodels.tex", + "linearmodels Documentation", + "Kevin Sheppard", + "manual", + ), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'linearmodels', 'linearmodels Documentation', - [author], 1) -] +man_pages = [(master_doc, "linearmodels", "linearmodels Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- @@ -220,23 +246,29 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'linearmodels', 'linearmodels Documentation', - author, 'linearmodels', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "linearmodels", + "linearmodels Documentation", + author, + "linearmodels", + "One line description of project.", + "Miscellaneous", + ), ] # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { - 'statsmodels': ('https://www.statsmodels.org/dev/', None), - 'matplotlib': ('https://matplotlib.org/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), - 'python': ('https://docs.python.org/3', None), - 'numpy': ('https://docs.scipy.org/doc/numpy', None), - 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'xarray': ('https://xarray.pydata.org/en/stable/', None) + "statsmodels": ("https://www.statsmodels.org/dev/", None), + "matplotlib": ("https://matplotlib.org/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "python": ("https://docs.python.org/3", None), + "numpy": ("https://docs.scipy.org/doc/numpy", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "xarray": ("https://xarray.pydata.org/en/stable/", None), } -extlinks = {'issue': ('https://github.com/bashtage/linearmodels/issues/%s', 'GH')} +extlinks = {"issue": ("https://github.com/bashtage/linearmodels/issues/%s", "GH")} doctest_global_setup = """ @@ -256,4 +288,54 @@ napoleon_use_admonition_for_references = True autosummary_generate = True -autoclass_content = 'class' \ No newline at end of file +autoclass_content = "class" + +# Create xrefs +numpydoc_use_autodoc_signature = True +numpydoc_xref_param_type = True +numpydoc_class_members_toctree = False +numpydoc_xref_aliases = { + "Figure": "matplotlib.figure.Figure", + "Axes": "matplotlib.axes.Axes", + "AxesSubplot": "matplotlib.axes.Axes", + "DataFrame": "pandas.DataFrame", + "Series": "pandas.Series", + "BetweenOLS": "linearmodels.panel.model.BetweenOLS", + "FamaMacBeth": "linearmodels.panel.model.FamaMacBeth", + "FirstDifferenceOLS": "linearmodels.panel.model.FirstDifferenceOLS", + "IV2SLS": "linearmodels.iv.model.IV2SLS", + "IV3SLS": "linearmodels.system.model.IV3SLS", + "IVGMM": "linearmodels.iv.model.IVGMM", + "IVGMMCUE": "linearmodels.iv.model.IVGMMCUE", + "IVLIML": "linearmodels.iv.model.IVLIML", + "IVSystemGMM": "linearmodels.system.model.IVSystemGMM", + "LinearFactorModel": "linearmodels.asset_pricing.model.LinearFactorModel", + "LinearFactorModelGMM": "linearmodels.asset_pricing.model.LinearFactorModelGMM", + "OLS": "linearmodels.iv.model.OLS", + "PanelOLS": "linearmodels.panel.model.PanelOLS", + "PooledOLS": "linearmodels.panel.model.PooledOLS", + "RandomEffects": "linearmodels.panel.model.RandomEffects", + "SUR": "linearmodels.system.model.SUR", + "TradedFactorModel": "linearmodels.asset_pricing.model.TradedFactorModel", + "AbsorbingLSResults": "linearmodels.iv.absorbing.AbsorbingLSResults", + "FirstStageResults": "linearmodels.iv.results.FirstStageResults", + "IVGMMResults": "linearmodels.iv.results.IVGMMResults", + "IVModelComparison": "linearmodels.iv.results.IVModelComparison", + "IVResults": "linearmodels.iv.results.IVResults", + "InvalidTestStatistic": "linearmodels.utility.InvalidTestStatistic", + "OLSResults": "linearmodels.iv.results.OLSResults", + "WaldTestStatistic": "linearmodels.utility.WaldTestStatistic", + "PanelEffectsResults": "linearmodels.panel.results.PanelEffectsResults", + "PanelModelComparison": "linearmodels.panel.results.PanelModelComparison", + "PanelResults": "linearmodels.panel.results.PanelResults", + "RandomEffectsResults": "linearmodels.panel.results.RandomEffectsResults", + "GMMSystemResults": "linearmodels.system.results.GMMSystemResults", + "Summary": "linearmodels.compat.statsmodels.Summary", + "SystemEquationResult": "linearmodels.system.results.SystemEquationResult", + "SystemResults": 
"linearmodels.system.results.SystemResults", + "GMMFactorModelResults": "linearmodels.asset_pricing.results.GMMFactorModelResults", + "LinearFactorModelResults": "linearmodels.asset_pricing.results.LinearFactorModelResults", + "PanelData": "linearmodels.panel.data.PanelData", + "IVData": "linearmodels.iv.data.IVData", + "AttrDict": "linearmodels.utility.AttrDict", +} diff --git a/doc/source/system/mathematical-detail.lyx b/doc/source/system/mathematical-detail.lyx index 625f647608..0cd60630c4 100644 --- a/doc/source/system/mathematical-detail.lyx +++ b/doc/source/system/mathematical-detail.lyx @@ -1068,5 +1068,244 @@ ic weighting formula immediately above. \end_layout +\begin_layout Subsection* +System Measures of Fit ( +\begin_inset Formula $R^{2}$ +\end_inset + +) +\end_layout + +\begin_layout Standard +Most measures of fit for systems of equations assume that all equations + contains a constant (or equivalent). + Caution is needed when interpreting if equations exclude constant terms. +\end_layout + +\begin_layout Subsubsection* +\noindent +Overall +\begin_inset Formula $R^{2}$ +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +The overall +\begin_inset Formula $R^{2}$ +\end_inset + + is defined as +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Formula +\[ +R^{2}=1-\frac{\sum_{i=1}^{K}SSR_{i}}{\sum_{i=1}^{K}TSS_{i}} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where +\begin_inset Formula $TSS_{i}$ +\end_inset + + is centered if equation +\begin_inset Formula $i$ +\end_inset + + contains a constant and uncentered if it does not. + When all equations contain constants, it is identical to Judge's measure. +\end_layout + +\begin_layout Subsubsection* +\noindent +McElroy +\end_layout + +\begin_layout Standard +\noindent +McElroy's (1977) measure is defined as +\begin_inset Formula +\[ +R^{2}=1-\frac{\epsilon^{\prime}\Omega^{-1}\epsilon}{Y^{\prime}\left(\Sigma^{-1}\otimes\left(I_{N}-\frac{\iota\iota^{\prime}}{N}\right)\right)Y} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where +\begin_inset Formula $\iota$ +\end_inset + + is a +\begin_inset Formula $N$ +\end_inset + + by 1 vector of 1s. + This is implemented as +\begin_inset Formula +\[ +R^{2}=1-\frac{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{\xi}_{ij}^{2}}{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{\eta}_{ij}^{2}} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where +\begin_inset Formula +\begin{align*} +\hat{\xi} & =\hat{E}\hat{\Sigma}^{-\frac{1}{2}}\\ +\hat{E} & =\left[\begin{array}{cccc} +\hat{\epsilon}_{1} & \hat{\epsilon}_{2} & \ldots & \hat{\epsilon}_{N}\end{array}\right] +\end{align*} + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +and +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Formula +\begin{align*} +\hat{\eta} & =\tilde{Y}\hat{\Sigma}^{-\frac{1}{2}}\\ +\tilde{Y} & =\left[\begin{array}{cccc} +Y_{1}-\hat{\mu}_{1} & Y_{2}-\hat{\mu}_{2} & \ldots & Y_{N}-\hat{\mu}_{N}\end{array}\right]. +\end{align*} + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where the vector of mean parameters is estimated by fitting a SURE to the + data (using user specified weights, if provided) where +\begin_inset Formula $X_{i}=\iota$ +\end_inset + + contains only a constant. 
+ Greene provides an alternative formulation of this measure as +\begin_inset Formula +\[ +R^{2}=1-\frac{K}{\mathrm{tr}\left(\hat{\Sigma}^{-1}\hat{\Psi}\right)} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where +\begin_inset Formula $\hat{\Psi}=N^{-1}\tilde{Y}^{\prime}\tilde{Y}$ +\end_inset + + is the covariance of the demeaned data. +\end_layout + +\begin_layout Subsubsection* +\noindent +Berndt +\end_layout + +\begin_layout Standard +\noindent +Berndt's measure is defined as +\begin_inset Formula +\[ +R^{2}=1-\frac{\left|\hat{\Sigma}\right|}{\left|\hat{\Psi}\right|}. +\] + +\end_inset + + +\end_layout + +\begin_layout Subsubsection* +\noindent +Judge +\end_layout + +\begin_layout Standard +\noindent +Judge's measure is the naive OLS +\begin_inset Formula $R^{2}$ +\end_inset + + for the system, +\end_layout + +\begin_layout Standard +\noindent +\begin_inset Formula +\[ +R^{2}=1-\frac{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{E}_{ij}^{2}}{\sum_{i=1}^{N}\sum_{j=1}^{K}\tilde{Y}_{ij}^{2}}. +\] + +\end_inset + + +\end_layout + +\begin_layout Subsubsection* +\noindent +Dhrymes +\end_layout + +\begin_layout Standard +\noindent +Dhrymes' measure of fit is a weighted average of the +\begin_inset Formula $R^{2}$ +\end_inset + + of each equation, +\begin_inset Formula +\[ +R^{2}=\sum_{i=1}^{K}R_{i}^{2}\frac{\hat{\Psi}_{ii}}{\mathrm{tr}\left(\hat{\Psi}\right)} +\] + +\end_inset + + +\end_layout + +\begin_layout Standard +\noindent +where +\begin_inset Formula $R_{i}^{2}$ +\end_inset + + is the coefficient of determination from equation +\begin_inset Formula $i$ +\end_inset + +. +\end_layout + \end_body \end_document diff --git a/doc/source/system/mathematical-detail.txt b/doc/source/system/mathematical-detail.txt index 04281b0ef4..113ef3a026 100644 --- a/doc/source/system/mathematical-detail.txt +++ b/doc/source/system/mathematical-detail.txt @@ -1,7 +1,3 @@ - -Formulas and Mathematical Detail -================================ - Seemingly Unrelated Regression (SUR/SURE) ----------------------------------------- @@ -217,6 +213,8 @@ hypothesis testing of parameters. It can also lead to more precise parameter estimates if some residuals are conditionally homoskedastic and regressors differ across equations. +.. _basic-notation-1: + Basic Notation ~~~~~~~~~~~~~~ @@ -372,3 +370,86 @@ cases these should be the same, and so the covariance of the estimated parameters will simplify to .. math:: \widehat{Var\left(\hat{\beta}\right)}=N^{-1}\left(\frac{X^{\prime}Z}{N}\hat{W}^{-1}\frac{Z^{\prime}X}{N}\right)^{-1}. + +System Measures of Fit (:math:`R^{2}`) +-------------------------------------- + +Most measures of fit for systems of equations assume that all equations +contains a constant (or equivalent). Caution is needed when interpreting +if equations exclude constant terms. + +Overall :math:`R^{2}` +~~~~~~~~~~~~~~~~~~~~~ + +The overall :math:`R^{2}` is defined as + +.. math:: R^{2}=1-\frac{\sum_{i=1}^{K}SSR_{i}}{\sum_{i=1}^{K}TSS_{i}} + +where :math:`TSS_{i}` is centered if equation :math:`i` contains a +constant and uncentered if it does not. When all equations contain +constants, it is identical to Judge’s measure. + +McElroy +~~~~~~~ + +McElroy’s (1977) measure is defined as + +.. math:: R^{2}=1-\frac{\epsilon^{\prime}\Omega^{-1}\epsilon}{Y^{\prime}\left(\Sigma^{-1}\otimes\left(I_{N}-\frac{\iota\iota^{\prime}}{N}\right)\right)Y} + +where :math:`\iota` is a :math:`N` by 1 vector of 1s. This is +implemented as + +.. 
math:: R^{2}=1-\frac{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{\xi}_{ij}^{2}}{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{\eta}_{ij}^{2}} + +where + +.. math:: + + \begin{aligned} + \hat{\xi} & =\hat{E}\hat{\Sigma}^{-\frac{1}{2}}\\ + \hat{E} & =\left[\begin{array}{cccc} + \hat{\epsilon}_{1} & \hat{\epsilon}_{2} & \ldots & \hat{\epsilon}_{N}\end{array}\right]\end{aligned} + +and + +.. math:: + + \begin{aligned} + \hat{\eta} & =\tilde{Y}\hat{\Sigma}^{-\frac{1}{2}}\\ + \tilde{Y} & =\left[\begin{array}{cccc} + Y_{1}-\hat{\mu}_{1} & Y_{2}-\hat{\mu}_{2} & \ldots & Y_{N}-\hat{\mu}_{N}\end{array}\right].\end{aligned} + +where the vector of mean parameters is estimated by fitting a SURE to +the data (using user specified weights, if provided) where +:math:`X_{i}=\iota` contains only a constant. Greene provides an +alternative formulation of this measure as + +.. math:: R^{2}=1-\frac{K}{\mathrm{tr}\left(\hat{\Sigma}^{-1}\hat{\Psi}\right)} + +where :math:`\hat{\Psi}=N^{-1}\tilde{Y}^{\prime}\tilde{Y}` is the +covariance of the demeaned data. + +Berndt +~~~~~~ + +Berndt’s measure is defined as + +.. math:: R^{2}=1-\frac{\left|\hat{\Sigma}\right|}{\left|\hat{\Psi}\right|}. + +Judge +~~~~~ + +Judge’s measure is the naive OLS :math:`R^{2}` for the system, + +.. math:: R^{2}=1-\frac{\sum_{i=1}^{N}\sum_{j=1}^{K}\hat{E}_{ij}^{2}}{\sum_{i=1}^{N}\sum_{j=1}^{K}\tilde{Y}_{ij}^{2}}. + +Dhrymes +~~~~~~~ + +Dhrymes’ measure of fit is a weighted average of the :math:`R^{2}` of +each equation, + +.. math:: R^{2}=\sum_{i=1}^{K}R_{i}^{2}\frac{\hat{\Psi}_{ii}}{\mathrm{tr}\left(\hat{\Psi}\right)} + +where :math:`R_{i}^{2}` is the coefficient of determination from +equation :math:`i`. diff --git a/linearmodels/__init__.py b/linearmodels/__init__.py index 99195c1e49..2ddf443a6c 100644 --- a/linearmodels/__init__.py +++ b/linearmodels/__init__.py @@ -41,31 +41,47 @@ FirstDifferenceOLS, PanelOLS, PooledOLS, RandomEffects) from linearmodels.system import IV3SLS, SUR, IVSystemGMM + from ._version import get_versions OLS = _OLS -WARN_ON_MISSING = os.environ.get('LINEARMODELS_WARN_ON_MISSING', True) -WARN_ON_MISSING = False if WARN_ON_MISSING in ('0', 'False') else True -DROP_MISSING = os.environ.get('LINEARMODELS_DROP_MISSING', True) -DROP_MISSING = False if DROP_MISSING in ('0', 'False') else True - -__all__ = ['PooledOLS', 'PanelOLS', 'FirstDifferenceOLS', 'BetweenOLS', - 'RandomEffects', - 'FamaMacBeth', - 'IVLIML', 'IVGMM', 'IVGMMCUE', 'IV2SLS', 'OLS', - 'SUR', 'IV3SLS', 'IVSystemGMM', - 'LinearFactorModel', 'LinearFactorModelGMM', 'TradedFactorModel', - 'WARN_ON_MISSING', 'DROP_MISSING'] +WARN_ON_MISSING = os.environ.get("LINEARMODELS_WARN_ON_MISSING", True) +WARN_ON_MISSING = False if WARN_ON_MISSING in ("0", "False") else True +DROP_MISSING = os.environ.get("LINEARMODELS_DROP_MISSING", True) +DROP_MISSING = False if DROP_MISSING in ("0", "False") else True + +__all__ = [ + "PooledOLS", + "PanelOLS", + "FirstDifferenceOLS", + "BetweenOLS", + "RandomEffects", + "FamaMacBeth", + "IVLIML", + "IVGMM", + "IVGMMCUE", + "IV2SLS", + "OLS", + "SUR", + "IV3SLS", + "IVSystemGMM", + "LinearFactorModel", + "LinearFactorModelGMM", + "TradedFactorModel", + "WARN_ON_MISSING", + "DROP_MISSING", +] def test(extra_args=None, exit=True, append=True): import sys + try: import pytest except ImportError: raise ImportError("Need pytest to run tests") - cmd = ['--tb=short', '--disable-pytest-warnings'] + cmd = ["--tb=short", "--disable-pytest-warnings"] if extra_args: if not isinstance(extra_args, list): extra_args = [extra_args] @@ -75,11 +91,11 @@ def 
test(extra_args=None, exit=True, append=True): cmd = extra_args pkg = os.path.dirname(__file__) cmd = [pkg] + cmd - print("running: pytest {}".format(' '.join(cmd))) + print("running: pytest {}".format(" ".join(cmd))) status = pytest.main(cmd) if exit: sys.exit(status) -__version__ = get_versions()['version'] +__version__ = get_versions()["version"] del get_versions diff --git a/linearmodels/_version.py b/linearmodels/_version.py index 2865a507d9..6430286885 100644 --- a/linearmodels/_version.py +++ b/linearmodels/_version.py @@ -68,8 +68,7 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -77,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -117,16 +119,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -182,7 +190,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -191,7 +199,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -199,19 +207,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -226,8 +241,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -235,10 +249,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -261,17 +284,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -280,10 +302,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -294,13 +318,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -331,8 +355,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -446,11 +469,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -470,9 +495,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -486,8 +515,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -496,13 +524,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -516,6 +547,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/linearmodels/asset_pricing/__init__.py b/linearmodels/asset_pricing/__init__.py index 0b61427ff9..9733e05e74 100644 --- a/linearmodels/asset_pricing/__init__.py +++ b/linearmodels/asset_pricing/__init__.py @@ -2,4 +2,4 @@ LinearFactorModelGMM, TradedFactorModel) -__all__ = ['TradedFactorModel', 'LinearFactorModelGMM', 'LinearFactorModel'] +__all__ = ["TradedFactorModel", "LinearFactorModelGMM", "LinearFactorModel"] diff --git a/linearmodels/asset_pricing/covariance.py b/linearmodels/asset_pricing/covariance.py index 258e0fadeb..48fd266699 100644 --- a/linearmodels/asset_pricing/covariance.py +++ b/linearmodels/asset_pricing/covariance.py @@ -8,7 +8,6 @@ class _HACMixin(object): - def __init__(self): self._bandwidth = None # pragma: no cover self._moments = None # pragma: no cover @@ -32,10 +31,10 @@ def bandwidth(self): def _check_kernel(self, kernel): if not isinstance(kernel, str): - raise TypeError('kernel must be the name of a kernel') + raise TypeError("kernel must be the name of a kernel") self._kernel = kernel.lower() if self._kernel not in KERNEL_LOOKUP: - raise ValueError('Unknown kernel') + raise ValueError("Unknown kernel") def _check_bandwidth(self, bandwidth): self._bandwidth = bandwidth @@ -43,9 +42,9 @@ def _check_bandwidth(self, bandwidth): try: bandwidth = float(bandwidth) except (TypeError, ValueError): - raise TypeError('bandwidth must be either None or a float') + raise TypeError("bandwidth must be either None or a float") if bandwidth < 0: - raise ValueError('bandwidth must be non-negative.') + raise ValueError("bandwidth must be non-negative.") def _kernel_cov(self, z): nobs = z.shape[0] @@ -79,16 +78,20 @@ class HeteroskedasticCovariance(object): Degree of freedom value to use if debiasing """ - def __init__(self, xe, *, jacobian=None, inv_jacobian=None, - center=True, debiased=False, df=0): + def __init__( + self, xe, *, jacobian=None, inv_jacobian=None, center=True, debiased=False, df=0 + ): self._moments = self._xe = xe self._jac = jacobian self._inv_jac = inv_jacobian self._center = center - if (jacobian is None and inv_jacobian is None) \ - or (jacobian is not None and inv_jacobian is not None): - raise ValueError('One and only one of jacobian or inv_jacobian must be provided.') + if (jacobian is None and inv_jacobian is None) or ( + jacobian is not None and inv_jacobian is not None + ): + raise ValueError( + "One and only one of jacobian or inv_jacobian must be provided." 
+ ) self._debiased = debiased self._df = df if jacobian is not None: @@ -100,11 +103,11 @@ def __str__(self): return self.__class__.__name__ def __repr__(self): - return self.__str__() + ', id: {0}'.format(hex(id(self))) + return self.__str__() + ", id: {0}".format(hex(id(self))) @property def config(self): - return {'type': self.__class__.__name__} + return {"type": self.__class__.__name__} @property def s(self): @@ -113,7 +116,7 @@ def s(self): Returns ------- - s : ndarray + ndarray Covariance of the scores or moment conditions """ xe = self._xe @@ -150,7 +153,7 @@ def cov(self): Returns ------- - c : ndarray + ndarray Parameter covariance """ s = self.s @@ -198,26 +201,38 @@ class KernelCovariance(HeteroskedasticCovariance, _HACMixin): linearmodels.iv.covariance.kernel_weight_quadratic_spectral """ - def __init__(self, xe, *, jacobian=None, inv_jacobian=None, - kernel='bartlett', bandwidth=None, center=True, - debiased=False, df=0): - super(KernelCovariance, self).__init__(xe, jacobian=jacobian, - inv_jacobian=inv_jacobian, - center=center, - debiased=debiased, df=df) + def __init__( + self, + xe, + *, + jacobian=None, + inv_jacobian=None, + kernel="bartlett", + bandwidth=None, + center=True, + debiased=False, + df=0 + ): + super(KernelCovariance, self).__init__( + xe, + jacobian=jacobian, + inv_jacobian=inv_jacobian, + center=center, + debiased=debiased, + df=df, + ) self._check_kernel(kernel) self._check_bandwidth(bandwidth) def __str__(self): - descr = ', Kernel: {0}, Bandwidth: {1}'.format(self._kernel, - self.bandwidth) + descr = ", Kernel: {0}, Bandwidth: {1}".format(self._kernel, self.bandwidth) return self.__class__.__name__ + descr @property def config(self): out = super(KernelCovariance, self).config - out['kernel'] = self._kernel - out['bandwidth'] = self.bandwidth + out["kernel"] = self._kernel + out["bandwidth"] = self.bandwidth return out @property @@ -227,7 +242,7 @@ def s(self): Returns ------- - s : ndarray + ndarray Covariance of the scores or moment conditions """ xe = self._xe @@ -263,7 +278,7 @@ def w(self, moments): Returns ------- - w : ndarray + ndarray Weighting matrix computed from moment conditions """ if self._center: @@ -290,7 +305,7 @@ class KernelWeight(HeteroskedasticWeight, _HACMixin): Non-negative integer bandwidth """ - def __init__(self, moments, center=True, kernel='bartlett', bandwidth=None): + def __init__(self, moments, center=True, kernel="bartlett", bandwidth=None): super(KernelWeight, self).__init__(moments, center=center) self._check_kernel(kernel) self._check_bandwidth(bandwidth) @@ -306,7 +321,7 @@ def w(self, moments): Returns ------- - w : ndarray + ndarray Weighting matrix computed from moment conditions """ if self._center: diff --git a/linearmodels/asset_pricing/model.py b/linearmodels/asset_pricing/model.py index 95724f893e..69016d8f3f 100644 --- a/linearmodels/asset_pricing/model.py +++ b/linearmodels/asset_pricing/model.py @@ -20,14 +20,14 @@ def callback_factory(obj, args, disp=1): - d = {'iter': 0} + d = {"iter": 0} disp = int(disp) def callback(params): fval = obj(params, *args) - if disp > 0 and (d['iter'] % disp == 0): - print('Iteration: {0}, Objective: {1}'.format(d['iter'], fval)) - d['iter'] += 1 + if disp > 0 and (d["iter"] % disp == 0): + print("Iteration: {0}, Objective: {1}".format(d["iter"], fval)) + d["iter"] += 1 return callback @@ -37,9 +37,9 @@ class TradedFactorModel(object): Parameters ---------- - portfolios : array-like + portfolios : array_like Test portfolio returns (nobs by nportfolio) - factors : array-like 
+ factors : array_like Priced factor returns (nobs by nfactor) Notes ----- Implements both estimation and testing. The estimated parameters are the proportionality constants that relate the test portfolios to the factors. """ def __init__(self, portfolios, factors): - self.portfolios = IVData(portfolios, var_name='portfolio') - self.factors = IVData(factors, var_name='factor') + self.portfolios = IVData(portfolios, var_name="portfolio") + self.factors = IVData(factors, var_name="factor") self._name = self.__class__.__name__ self._formula = None self._validate_data() @@ -70,19 +70,21 @@ def __str__(self): out = self.__class__.__name__ f, p = self.factors.shape[1], self.portfolios.shape[1] - out += ' with {0} factors, {1} test portfolios'.format(f, p) + out += " with {0} factors, {1} test portfolios".format(f, p) return out def __repr__(self): - return self.__str__() + '\nid: {0}'.format(hex(id(self))) + return self.__str__() + "\nid: {0}".format(hex(id(self))) def _drop_missing(self): data = (self.portfolios, self.factors) missing = np.any(np.c_[[dh.isnull for dh in data]], 0) if any(missing): if all(missing): - raise ValueError('All observations contain missing data. ' - 'Model cannot be estimated.') + raise ValueError( + "All observations contain missing data. " + "Model cannot be estimated." + ) self.portfolios.drop(missing) self.factors.drop(missing) missing_warning(missing) @@ -93,23 +95,31 @@ def _validate_data(self): p = self.portfolios.ndarray f = self.factors.ndarray if p.shape[0] != f.shape[0]: - raise ValueError('The number of observations in portfolios and ' - 'factors is not the same.') + raise ValueError( + "The number of observations in portfolios and " + "factors is not the same." + ) self._drop_missing() p = self.portfolios.ndarray f = self.factors.ndarray if has_constant(p)[0]: - raise ValueError('portfolios must not contains a constant or ' - 'equivalent and must not have rank\n' - 'less than the dimension of the smaller shape.') + raise ValueError( + "portfolios must not contain a constant or " + "equivalent and must not have rank\n" + "less than the dimension of the smaller shape." + ) if has_constant(f)[0]: - raise ValueError('factors must not contain a constant or equivalent.') + raise ValueError("factors must not contain a constant or equivalent.") if np.linalg.matrix_rank(f) < f.shape[1]: - raise ValueError('Model cannot be estimated. factors do not have full column rank.') + raise ValueError( + "Model cannot be estimated. factors do not have full column rank." + ) if p.shape[0] < (f.shape[1] + 1): - raise ValueError('Model cannot be estimated. portfolios must have factors + 1 or ' - 'more returns to\nestimate the model parameters.') + raise ValueError( + "Model cannot be estimated. portfolios must have factors + 1 or " + "more returns to\nestimate the model parameters." 
+ ) @property def formula(self): @@ -121,16 +131,26 @@ def formula(self, value): @staticmethod def _prepare_data_from_formula(formula, data, portfolios): - na_action = NAAction(on_NA='raise', NA_types=[]) + na_action = NAAction(on_NA="raise", NA_types=[]) orig_formula = formula if portfolios is not None: - factors = dmatrix(formula + ' + 0', data, return_type='dataframe', NA_action=na_action) + factors = dmatrix( + formula + " + 0", data, return_type="dataframe", NA_action=na_action + ) else: - formula = formula.split('~') - portfolios = dmatrix(formula[0].strip() + ' + 0', data, - return_type='dataframe', NA_action=na_action) - factors = dmatrix(formula[1].strip() + ' + 0', data, - return_type='dataframe', NA_action=na_action) + formula = formula.split("~") + portfolios = dmatrix( + formula[0].strip() + " + 0", + data, + return_type="dataframe", + NA_action=na_action, + ) + factors = dmatrix( + formula[1].strip() + " + 0", + data, + return_type="dataframe", + NA_action=na_action, + ) return factors, portfolios, orig_formula @@ -143,12 +163,12 @@ def from_formula(cls, formula, data, *, portfolios=None): Patsy formula modified for the syntax described in the notes data : DataFrame DataFrame containing the variables used in the formula - portfolios : array-like, optional + portfolios : array_like, optional Portfolios to be used in the model Returns ------- - model : TradedFactorModel + TradedFactorModel Model instance Notes @@ -172,12 +192,14 @@ def from_formula(cls, formula, data, *, portfolios=None): >>> formula = 'MktRF + SMB + HML' >>> mod = TradedFactorModel.from_formula(formula, data, portfolios=portfolios) """ - factors, portfolios, formula = cls._prepare_data_from_formula(formula, data, portfolios) + factors, portfolios, formula = cls._prepare_data_from_formula( + formula, data, portfolios + ) mod = cls(portfolios, factors) mod.formula = formula return mod - def fit(self, cov_type='robust', debiased=True, **cov_config): + def fit(self, cov_type="robust", debiased=True, **cov_config): """ Estimate model parameters @@ -193,7 +215,7 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): Returns ------- - results : LinearFactorModelResults + LinearFactorModelResults Results class with parameter estimates, covariance and test statistics Notes @@ -221,41 +243,61 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): nloading = (nfactor + 1) * nportfolio xpxi = np.eye(nloading + nfactor) - xpxi[:nloading, :nloading] = np.kron(np.eye(nportfolio), np.linalg.pinv(fc.T @ fc / nobs)) + xpxi[:nloading, :nloading] = np.kron( + np.eye(nportfolio), np.linalg.pinv(fc.T @ fc / nobs) + ) f_rep = np.tile(fc, (1, nportfolio)) eps_rep = np.tile(eps, (nfactor + 1, 1)) # 1 2 3 ... 25 1 2 3 ... 
- eps_rep = eps_rep.ravel(order='F') - eps_rep = np.reshape(eps_rep, (nobs, (nfactor + 1) * nportfolio), order='F') + eps_rep = eps_rep.ravel(order="F") + eps_rep = np.reshape(eps_rep, (nobs, (nfactor + 1) * nportfolio), order="F") xe = f_rep * eps_rep xe = np.c_[xe, fe] - if cov_type in ('robust', 'heteroskedastic'): - cov_est = HeteroskedasticCovariance(xe, inv_jacobian=xpxi, center=False, - debiased=debiased, df=fc.shape[1]) - rp_cov_est = HeteroskedasticCovariance(fe, jacobian=np.eye(f.shape[1]), center=False, - debiased=debiased, df=1) - elif cov_type == 'kernel': - cov_est = KernelCovariance(xe, inv_jacobian=xpxi, center=False, debiased=debiased, - df=fc.shape[1], **cov_config) + if cov_type in ("robust", "heteroskedastic"): + cov_est = HeteroskedasticCovariance( + xe, inv_jacobian=xpxi, center=False, debiased=debiased, df=fc.shape[1] + ) + rp_cov_est = HeteroskedasticCovariance( + fe, jacobian=np.eye(f.shape[1]), center=False, debiased=debiased, df=1 + ) + elif cov_type == "kernel": + cov_est = KernelCovariance( + xe, + inv_jacobian=xpxi, + center=False, + debiased=debiased, + df=fc.shape[1], + **cov_config + ) bw = cov_est.bandwidth _cov_config = {k: v for k, v in cov_config.items()} - _cov_config['bandwidth'] = bw - rp_cov_est = KernelCovariance(fe, jacobian=np.eye(f.shape[1]), center=False, - debiased=debiased, df=1, **_cov_config) + _cov_config["bandwidth"] = bw + rp_cov_est = KernelCovariance( + fe, + jacobian=np.eye(f.shape[1]), + center=False, + debiased=debiased, + df=1, + **_cov_config + ) else: - raise ValueError('Unknown cov_type: {0}'.format(cov_type)) + raise ValueError("Unknown cov_type: {0}".format(cov_type)) full_vcv = cov_est.cov rp_cov = rp_cov_est.cov vcv = full_vcv[:nloading, :nloading] # Rearrange VCV - order = np.reshape(np.arange((nfactor + 1) * nportfolio), (nportfolio, nfactor + 1)) + order = np.reshape( + np.arange((nfactor + 1) * nportfolio), (nportfolio, nfactor + 1) + ) order = order.T.ravel() vcv = vcv[order][:, order] # Return values alpha_vcv = vcv[:nportfolio, :nportfolio] stat = float(alphas.T @ np.linalg.pinv(alpha_vcv) @ alphas) - jstat = WaldTestStatistic(stat, 'All alphas are 0', nportfolio, name='J-statistic') + jstat = WaldTestStatistic( + stat, "All alphas are 0", nportfolio, name="J-statistic" + ) params = b.T betas = b[1:].T residual_ss = (eps ** 2).sum() @@ -264,19 +306,34 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): r2 = 1 - residual_ss / total_ss param_names = [] for portfolio in self.portfolios.cols: - param_names.append('alpha-{0}'.format(portfolio)) + param_names.append("alpha-{0}".format(portfolio)) for factor in self.factors.cols: - param_names.append('beta-{0}-{1}'.format(portfolio, factor)) + param_names.append("beta-{0}-{1}".format(portfolio, factor)) for factor in self.factors.cols: - param_names.append('lambda-{0}'.format(factor)) - - res = AttrDict(params=params, cov=full_vcv, betas=betas, rp=rp, rp_cov=rp_cov, - alphas=alphas, alpha_vcv=alpha_vcv, jstat=jstat, - rsquared=r2, total_ss=total_ss, residual_ss=residual_ss, - param_names=param_names, portfolio_names=self.portfolios.cols, - factor_names=self.factors.cols, name=self._name, - cov_type=cov_type, model=self, nobs=nobs, rp_names=self.factors.cols, - cov_est=cov_est) + param_names.append("lambda-{0}".format(factor)) + + res = AttrDict( + params=params, + cov=full_vcv, + betas=betas, + rp=rp, + rp_cov=rp_cov, + alphas=alphas, + alpha_vcv=alpha_vcv, + jstat=jstat, + rsquared=r2, + total_ss=total_ss, + residual_ss=residual_ss, + param_names=param_names, 
+ portfolio_names=self.portfolios.cols, + factor_names=self.factors.cols, + name=self._name, + cov_type=cov_type, + model=self, + nobs=nobs, + rp_names=self.factors.cols, + cov_est=cov_est, + ) return LinearFactorModelResults(res) @@ -286,15 +343,15 @@ class LinearFactorModel(TradedFactorModel): Parameters ---------- - portfolios : array-like + portfolios : array_like Test portfolio returns (nobs by nportfolio) - factors : array-like + factors : array_like Priced factor returns (nobs by nfactor) risk_free : bool, optional Flag indicating whether the risk-free rate should be estimated from returns along with other risk premia. If False, the returns are assumed to be excess returns using the correct risk-free rate. - sigma : array-like, optional + sigma : array_like, optional Positive definite residual covariance (nportfolio by nportfolio) Notes @@ -335,7 +392,9 @@ def __init__(self, portfolios, factors, *, risk_free=False, sigma=None): super(LinearFactorModel, self).__init__(portfolios, factors) self._validate_additional_data() if sigma is None: - self._sigma_m12 = self._sigma_inv = self._sigma = np.eye(self.portfolios.shape[1]) + self._sigma_m12 = self._sigma_inv = self._sigma = np.eye( + self.portfolios.shape[1] + ) else: self._sigma = np.asarray(sigma) vals, vecs = np.linalg.eigh(sigma) @@ -345,23 +404,26 @@ def __str__(self): out = super(LinearFactorModel, self).__str__() if np.any(self._sigma != np.eye(self.portfolios.shape[1])): - out += ' using GLS' - out += '\nEstimated risk-free rate: {0}'.format(self._risk_free) + out += " using GLS" + out += "\nEstimated risk-free rate: {0}".format(self._risk_free) return out def _validate_additional_data(self): f = self.factors.ndarray p = self.portfolios.ndarray - nrp = (f.shape[1] + int(self._risk_free)) + nrp = f.shape[1] + int(self._risk_free) if p.shape[1] < nrp: - raise ValueError('The number of test portfolio must be at least as ' - 'large as the number of risk premia, including the ' - 'risk free rate if estimated.') + raise ValueError( + "The number of test portfolios must be at least as " + "large as the number of risk premia, including the " + "risk free rate if estimated." + ) @classmethod - def from_formula(cls, formula, data, *, portfolios=None, risk_free=False, - sigma=None): + def from_formula( + cls, formula, data, *, portfolios=None, risk_free=False, sigma=None + ): """ Parameters ---------- @@ -369,19 +431,19 @@ def from_formula(cls, formula, data, *, portfolios=None, risk_free=False, Patsy formula modified for the syntax described in the notes data : DataFrame DataFrame containing the variables used in the formula - portfolios : array-like, optional + portfolios : array_like, optional Portfolios to be used in the model. If provided, must use formula syntax containing only factors. risk_free : bool, optional Flag indicating whether the risk-free rate should be estimated from returns along with other risk premia. If False, the returns are assumed to be excess returns using the correct risk-free rate. 
- sigma : array-like, optional + sigma : array_like, optional Positive definite residual covariance (nportfolio by nportfolio) Returns ------- - model : LinearFactorModel + LinearFactorModel Model instance Notes @@ -405,12 +467,14 @@ def from_formula(cls, formula, data, *, portfolios=None, risk_free=False, >>> formula = 'MktRF + SMB + HML' >>> mod = LinearFactorModel.from_formula(formula, data, portfolios=portfolios) """ - factors, portfolios, formula = cls._prepare_data_from_formula(formula, data, portfolios) + factors, portfolios, formula = cls._prepare_data_from_formula( + formula, data, portfolios + ) mod = cls(portfolios, factors, risk_free=risk_free, sigma=sigma) mod.formula = formula return mod - def fit(self, cov_type='robust', debiased=True, **cov_config): + def fit(self, cov_type="robust", debiased=True, **cov_config): """ Estimate model parameters @@ -426,7 +490,7 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): Returns ------- - results : LinearFactorModelResults + LinearFactorModelResults Results class with parameter estimates, covariance and test statistics Notes @@ -461,21 +525,28 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): # Jacobian jacobian = self._jacobian(betas, lam, alphas) - if cov_type not in ('robust', 'heteroskedastic', 'kernel'): - raise ValueError('Unknown weight: {0}'.format(cov_type)) - if cov_type in ('robust', 'heteroskedastic'): + if cov_type not in ("robust", "heteroskedastic", "kernel"): + raise ValueError("Unknown weight: {0}".format(cov_type)) + if cov_type in ("robust", "heteroskedastic"): cov_est = HeteroskedasticCovariance else: # 'kernel': cov_est = KernelCovariance - cov_est = cov_est(moments, jacobian=jacobian, center=False, - debiased=debiased, df=fc.shape[1], **cov_config) + cov_est = cov_est( + moments, + jacobian=jacobian, + center=False, + debiased=debiased, + df=fc.shape[1], + **cov_config + ) # VCV full_vcv = cov_est.cov alpha_vcv = full_vcv[s2:, s2:] stat = float(alphas.T @ np.linalg.pinv(alpha_vcv) @ alphas) - jstat = WaldTestStatistic(stat, 'All alphas are 0', nport - nf - nrf, - name='J-statistic') + jstat = WaldTestStatistic( + stat, "All alphas are 0", nport - nf - nrf, name="J-statistic" + ) total_ss = ((p - p.mean(0)[None, :]) ** 2).sum() residual_ss = (eps ** 2).sum() @@ -486,13 +557,13 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): params = np.c_[alphas, betas] param_names = [] for portfolio in self.portfolios.cols: - param_names.append('alpha-{0}'.format(portfolio)) + param_names.append("alpha-{0}".format(portfolio)) for factor in self.factors.cols: - param_names.append('beta-{0}-{1}'.format(portfolio, factor)) + param_names.append("beta-{0}-{1}".format(portfolio, factor)) if not excess_returns: - param_names.append('lambda-risk_free') + param_names.append("lambda-risk_free") for factor in self.factors.cols: - param_names.append('lambda-{0}'.format(factor)) + param_names.append("lambda-{0}".format(factor)) # Pivot vcv to remove unnecessary and have correct order order = np.reshape(np.arange(s1), (nport, nf + 1)) @@ -503,14 +574,29 @@ def fit(self, cov_type='robust', debiased=True, **cov_config): factor_names = list(self.factors.cols) rp_names = factor_names[:] if not excess_returns: - rp_names.insert(0, 'risk_free') - res = AttrDict(params=params, cov=full_vcv, betas=betas, rp=rp, rp_cov=rp_cov, - alphas=alphas, alpha_vcv=alpha_vcv, jstat=jstat, - rsquared=r2, total_ss=total_ss, residual_ss=residual_ss, - param_names=param_names, portfolio_names=self.portfolios.cols, - 
factor_names=factor_names, - cov_type=cov_type, model=self, nobs=nobs, rp_names=rp_names, - cov_est=cov_est) + rp_names.insert(0, "risk_free") + res = AttrDict( + params=params, + cov=full_vcv, + betas=betas, + rp=rp, + rp_cov=rp_cov, + alphas=alphas, + alpha_vcv=alpha_vcv, + jstat=jstat, + rsquared=r2, + total_ss=total_ss, + residual_ss=residual_ss, + param_names=param_names, + portfolio_names=self.portfolios.cols, + factor_names=factor_names, + name=self._name, + cov_type=cov_type, + model=self, + nobs=nobs, + rp_names=rp_names, + cov_est=cov_est, + ) return LinearFactorModelResults(res) @@ -544,7 +630,7 @@ def _jacobian(self, betas, lam, alphas): block = np.zeros((nf + nrf, nf + 1)) block[:, 1:] = b_tilde[[i]].T @ _lam.T block[nrf:, 1:] -= alpha_tilde[i] * np.eye(nf) - jac[s1:s2, (i * (nf + 1)):((i + 1) * (nf + 1))] = block + jac[s1:s2, (i * (nf + 1)) : ((i + 1) * (nf + 1))] = block jac[s1:s2, s1:s2] = bc.T @ sigma_inv @ bc zero_lam = np.r_[[[0]], _lam] jac[s2:s3, :s1] = np.kron(np.eye(nport), zero_lam.T) @@ -575,9 +661,9 @@ class LinearFactorModelGMM(LinearFactorModel): Parameters ---------- - portfolios : array-like + portfolios : array_like Test portfolio returns (nobs by nportfolio) - factors : array-like + factors : array_like Priced factor values (nobs by nfactor) risk_free : bool, optional Flag indicating whether the risk-free rate should be estimated @@ -614,7 +700,9 @@ class LinearFactorModelGMM(LinearFactorModel): """ def __init__(self, factors, portfolios, *, risk_free=False): - super(LinearFactorModelGMM, self).__init__(factors, portfolios, risk_free=risk_free) + super(LinearFactorModelGMM, self).__init__( + factors, portfolios, risk_free=risk_free + ) @classmethod def from_formula(cls, formula, data, *, portfolios=None, risk_free=False): """ Parameters ---------- formula : str Patsy formula modified for the syntax described in the notes data : DataFrame DataFrame containing the variables used in the formula - portfolios : array-like, optional + portfolios : array_like, optional Portfolios to be used in the model. If provided, must use formula syntax containing only factors. 
risk_free : bool, optional @@ -635,7 +723,7 @@ Returns ------- - model : LinearFactorModelGMM + LinearFactorModelGMM Model instance Notes @@ -659,13 +747,24 @@ >>> formula = 'MktRF + SMB + HML' >>> mod = LinearFactorModelGMM.from_formula(formula, data, portfolios=portfolios) - factors, portfolios, formula = cls._prepare_data_from_formula(formula, data, portfolios) + factors, portfolios, formula = cls._prepare_data_from_formula( + formula, data, portfolios + ) mod = cls(portfolios, factors, risk_free=risk_free) mod.formula = formula return mod - def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, - cov_type='robust', debiased=True, **cov_config): + def fit( + self, + center=True, + use_cue=False, + steps=2, + disp=10, + max_iter=1000, + cov_type="robust", + debiased=True, + **cov_config + ): """ Estimate model parameters @@ -695,7 +794,7 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, Returns ------- - results : GMMFactorModelResults + GMMFactorModelResults Results class with parameter estimates, covariance and test statistics Notes @@ -710,7 +809,9 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, excess_returns = not self._risk_free nrf = int(not bool(excess_returns)) # 1. Starting Values - use 2 pass - mod = LinearFactorModel(self.portfolios, self.factors, risk_free=self._risk_free) + mod = LinearFactorModel( + self.portfolios, self.factors, risk_free=self._risk_free + ) res = mod.fit() betas = np.asarray(res.betas).ravel() lam = np.asarray(res.risk_premia) @@ -718,9 +819,9 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, sv = np.r_[betas, lam, mu][:, None] g = self._moments(sv, excess_returns) g -= g.mean(0)[None, :] if center else 0 - if cov_type not in ('robust', 'heteroskedastic', 'kernel'): - raise ValueError('Unknown weight: {0}'.format(cov_type)) - if cov_type in ('robust', 'heteroskedastic'): + if cov_type not in ("robust", "heteroskedastic", "kernel"): + raise ValueError("Unknown weight: {0}".format(cov_type)) + if cov_type in ("robust", "heteroskedastic"): weight_est = HeteroskedasticWeight cov_est = HeteroskedasticCovariance else: # 'kernel': @@ -733,8 +834,13 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, # 2. Step 1 using w = inv(s) from SV callback = callback_factory(self._j, args, disp=disp) - res = minimize(self._j, sv, args=args, callback=callback, - options={'disp': bool(disp), 'maxiter': max_iter}) + res = minimize( + self._j, + sv, + args=args, + callback=callback, + options={"disp": bool(disp), "maxiter": max_iter}, + ) params = res.x last_obj = res.fun iters = 1 @@ -748,8 +854,13 @@ # 2. 
Step 1 using w = inv(s) from SV callback = callback_factory(self._j, args, disp=disp) - res = minimize(self._j, params, args=args, callback=callback, - options={'disp': bool(disp), 'maxiter': max_iter}) + res = minimize( + self._j, + params, + args=args, + callback=callback, + options={"disp": bool(disp), "maxiter": max_iter}, + ) params = res.x obj = res.fun if np.abs(obj - last_obj) < 1e-6: @@ -760,8 +871,13 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, args = (excess_returns, weight_est) obj = self._j_cue callback = callback_factory(obj, args, disp=disp) - res = minimize(obj, params, args=args, callback=callback, - options={'disp': bool(disp), 'maxiter': max_iter}) + res = minimize( + obj, + params, + args=args, + callback=callback, + options={"disp": bool(disp), "maxiter": max_iter}, + ) params = res.x # 4. Compute final S and G for inference @@ -769,8 +885,14 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, s = g.T @ g / nobs jac = self._jacobian(params, excess_returns) - cov_est = cov_est(g, jacobian=jac, center=center, debiased=debiased, - df=self.factors.shape[1], **cov_config) + cov_est = cov_est( + g, + jacobian=jac, + center=center, + debiased=debiased, + df=self.factors.shape[1], + **cov_config + ) full_vcv = cov_est.cov sel = slice((n * k), (n * k + k + nrf)) @@ -780,10 +902,12 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, alphas = g.mean(0)[sel, None] alpha_vcv = s[sel, sel] / nobs stat = self._j(params, excess_returns, w) - jstat = WaldTestStatistic(stat, 'All alphas are 0', n - k - nrf, name='J-statistic') + jstat = WaldTestStatistic( + stat, "All alphas are 0", n - k - nrf, name="J-statistic" + ) # R2 calculation - betas = np.reshape(params[:(n * k)], (n, k)) + betas = np.reshape(params[: (n * k)], (n, k)) resids = self.portfolios.ndarray - self.factors.ndarray @ betas.T resids -= resids.mean(0)[None, :] residual_ss = (resids ** 2).sum() @@ -794,23 +918,39 @@ def fit(self, center=True, use_cue=False, steps=2, disp=10, max_iter=1000, param_names = [] for portfolio in self.portfolios.cols: for factor in self.factors.cols: - param_names.append('beta-{0}-{1}'.format(portfolio, factor)) + param_names.append("beta-{0}-{1}".format(portfolio, factor)) if not excess_returns: - param_names.append('lambda-risk_free') - param_names.extend(['lambda-{0}'.format(f) for f in self.factors.cols]) - param_names.extend(['mu-{0}'.format(f) for f in self.factors.cols]) + param_names.append("lambda-risk_free") + param_names.extend(["lambda-{0}".format(f) for f in self.factors.cols]) + param_names.extend(["mu-{0}".format(f) for f in self.factors.cols]) rp_names = list(self.factors.cols)[:] if not excess_returns: - rp_names.insert(0, 'risk_free') + rp_names.insert(0, "risk_free") params = np.c_[alphas, betas] # 5. 
Return values - res = AttrDict(params=params, cov=full_vcv, betas=betas, rp=rp, rp_cov=rp_cov, - alphas=alphas, alpha_vcv=alpha_vcv, jstat=jstat, - rsquared=r2, total_ss=total_ss, residual_ss=residual_ss, - param_names=param_names, portfolio_names=self.portfolios.cols, - factor_names=self.factors.cols, name=self._name, - cov_type=cov_type, model=self, nobs=nobs, rp_names=rp_names, - iter=iters, cov_est=cov_est) + res = AttrDict( + params=params, + cov=full_vcv, + betas=betas, + rp=rp, + rp_cov=rp_cov, + alphas=alphas, + alpha_vcv=alpha_vcv, + jstat=jstat, + rsquared=r2, + total_ss=total_ss, + residual_ss=residual_ss, + param_names=param_names, + portfolio_names=self.portfolios.cols, + factor_names=self.factors.cols, + name=self._name, + cov_type=cov_type, + model=self, + nobs=nobs, + rp_names=rp_names, + iter=iters, + cov_est=cov_est, + ) return GMMFactorModelResults(res) @@ -877,10 +1017,10 @@ def _jacobian(self, params, excess_returns): b = betas[[i]] else: b = np.c_[[1], betas[[i]]] - jac12[(i * (k + 1)):(i + 1) * (k + 1)] = f_aug.T @ (iota @ b) / nobs + jac12[(i * (k + 1)) : (i + 1) * (k + 1)] = f_aug.T @ (iota @ b) / nobs b = betas[[i]] - jac13[(i * (k + 1)):(i + 1) * (k + 1)] = -f_aug.T @ (iota @ b) / nobs + jac13[(i * (k + 1)) : (i + 1) * (k + 1)] = -f_aug.T @ (iota @ b) / nobs jac[:r1, s1:s2] = jac12 jac[:r1, s2:] = jac13 jac[-k:, -k:] = np.eye(k) diff --git a/linearmodels/asset_pricing/results.py b/linearmodels/asset_pricing/results.py index 5aac36354c..64b2dc8ae3 100644 --- a/linearmodels/asset_pricing/results.py +++ b/linearmodels/asset_pricing/results.py @@ -5,9 +5,9 @@ import datetime as dt -from property_cached import cached_property import numpy as np import pandas as pd +from property_cached import cached_property from scipy import stats from statsmodels.iolib.summary import SimpleTable, fmt_2cols, fmt_params @@ -23,6 +23,7 @@ class LinearFactorModelResults(_SummaryStr): results : dict[str, any] A dictionary of results from the model estimation. """ + def __init__(self, results): self._jstat = results.jstat self._params = results.params @@ -40,7 +41,7 @@ def __init__(self, results): self.model = results.model self._nobs = results.nobs self._datetime = dt.datetime.now() - self._cols = ['alpha'] + ['{0}'.format(f) for f in self._factor_names] + self._cols = ["alpha"] + ["{0}".format(f) for f in self._factor_names] self._rp_names = results.rp_names self._alpha_vcv = results.alpha_vcv self._cov_est = results.cov_est @@ -53,27 +54,31 @@ def summary(self): ``summary.as_html()`` and ``summary.as_latex()``. """ - title = self.name + ' Estimation Summary' + title = self.name + " Estimation Summary" - top_left = [('No. Test Portfolios:', len(self._portfolio_names)), - ('No. Factors:', len(self._factor_names)), - ('No. Observations:', self.nobs), - ('Date:', self._datetime.strftime('%a, %b %d %Y')), - ('Time:', self._datetime.strftime('%H:%M:%S')), - ('Cov. Estimator:', self._cov_type), - ('', '')] + top_left = [ + ("No. Test Portfolios:", len(self._portfolio_names)), + ("No. Factors:", len(self._factor_names)), + ("No. Observations:", self.nobs), + ("Date:", self._datetime.strftime("%a, %b %d %Y")), + ("Time:", self._datetime.strftime("%H:%M:%S")), + ("Cov. 
Estimator:", self._cov_type), + ("", ""), + ] j_stat = _str(self.j_statistic.stat) j_pval = pval_format(self.j_statistic.pval) j_dist = self.j_statistic.dist_name - top_right = [('R-squared:', _str(self.rsquared)), - ('J-statistic:', j_stat), - ('P-value', j_pval), - ('Distribution:', j_dist), - ('', ''), - ('', ''), - ('', '')] + top_right = [ + ("R-squared:", _str(self.rsquared)), + ("J-statistic:", j_stat), + ("P-value", j_pval), + ("Distribution:", j_dist), + ("", ""), + ("", ""), + ("", ""), + ] stubs = [] vals = [] @@ -87,9 +92,9 @@ def summary(self): # Top Table # Parameter table fmt = fmt_2cols - fmt['data_fmts'][1] = '%18s' + fmt["data_fmts"][1] = "%18s" - top_right = [('%-21s' % (' ' + k), v) for k, v in top_right] + top_right = [("%-21s" % (" " + k), v) for k, v in top_right] stubs = [] vals = [] for stub, val in top_right: @@ -103,11 +108,7 @@ def summary(self): tstats = np.asarray(self.risk_premia / self.risk_premia_se) pvalues = 2 - 2 * stats.norm.cdf(np.abs(tstats)) ci = rp + se * stats.norm.ppf([[0.025, 0.975]]) - param_data = np.c_[rp, - se, - tstats[:, None], - pvalues[:, None], - ci] + param_data = np.c_[rp, se, tstats[:, None], pvalues[:, None], ci] data = [] for row in param_data: txt_row = [] @@ -117,24 +118,26 @@ def summary(self): f = pval_format txt_row.append(f(v)) data.append(txt_row) - title = 'Risk Premia Estimates' + title = "Risk Premia Estimates" table_stubs = list(self.risk_premia.index) - header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value', 'Lower CI', 'Upper CI'] - table = SimpleTable(data, - stubs=table_stubs, - txt_fmt=fmt_params, - headers=header, - title=title) + header = ["Parameter", "Std. Err.", "T-stat", "P-value", "Lower CI", "Upper CI"] + table = SimpleTable( + data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title + ) smry.tables.append(table) - smry.add_extra_txt(['Covariance estimator:', - str(self._cov_est), - 'See full_summary for complete results']) + smry.add_extra_txt( + [ + "Covariance estimator:", + str(self._cov_est), + "See full_summary for complete results", + ] + ) return smry @staticmethod def _single_table(params, se, name, param_names, first=False): - tstats = (params / se) + tstats = params / se pvalues = 2 - 2 * stats.norm.cdf(tstats) ci = params + se * stats.norm.ppf([[0.025, 0.975]]) param_data = np.c_[params, se, tstats, pvalues, ci] @@ -148,14 +151,22 @@ def _single_table(params, se, name, param_names, first=False): f = pval_format txt_row.append(f(v)) data.append(txt_row) - title = '{0} Coefficients'.format(name) + title = "{0} Coefficients".format(name) table_stubs = param_names if first: - header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value', 'Lower CI', 'Upper CI'] + header = [ + "Parameter", + "Std. 
Err.", + "T-stat", + "P-value", + "Lower CI", + "Upper CI", + ] else: header = None - table = SimpleTable(data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, - title=title) + table = SimpleTable( + data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title + ) return table @@ -168,10 +179,16 @@ def full_summary(self): param_names = list(params.columns) first = True for row in params.index: - smry.tables.append(SimpleTable([''])) - smry.tables.append(self._single_table(np.asarray(params.loc[row])[:, None], - np.asarray(se.loc[row])[:, None], - row, param_names, first)) + smry.tables.append(SimpleTable([""])) + smry.tables.append( + self._single_table( + np.asarray(params.loc[row])[:, None], + np.asarray(se.loc[row])[:, None], + row, + param_names, + first, + ) + ) first = False return smry @@ -199,7 +216,9 @@ def betas(self): @property def params(self): """Estimated parameters""" - return pd.DataFrame(self._params, columns=self._cols, index=self._portfolio_names) + return pd.DataFrame( + self._params, columns=self._cols, index=self._portfolio_names + ) @property def std_errors(self): @@ -224,7 +243,9 @@ def cov_estimator(self): @property def cov(self): """Estimated covariance of parameters""" - return pd.DataFrame(self._cov, columns=self._param_names, index=self._param_names) + return pd.DataFrame( + self._cov, columns=self._param_names, index=self._param_names + ) @property def j_statistic(self): @@ -233,7 +254,7 @@ def j_statistic(self): Returns ------- - j : WaldTestStatistic + WaldTestStatistic Test statistic for null that model prices test portfolios Notes diff --git a/linearmodels/compat/numpy.py b/linearmodels/compat/numpy.py index 02c68a4eba..5792d29f64 100644 --- a/linearmodels/compat/numpy.py +++ b/linearmodels/compat/numpy.py @@ -2,7 +2,7 @@ import numpy as np -NP_LT_114 = LooseVersion(np.__version__) < LooseVersion('1.14') +NP_LT_114 = LooseVersion(np.__version__) < LooseVersion("1.14") def lstsq(a, b, rcond=None): @@ -14,4 +14,4 @@ def lstsq(a, b, rcond=None): return np.linalg.lstsq(a, b, rcond=rcond) -__all__ = ['lstsq'] +__all__ = ["lstsq"] diff --git a/linearmodels/compat/pandas.py b/linearmodels/compat/pandas.py index 8deabb8750..1d943bccc3 100644 --- a/linearmodels/compat/pandas.py +++ b/linearmodels/compat/pandas.py @@ -5,11 +5,20 @@ from linearmodels.typing import AnyPandas -PD_LT_023 = LooseVersion(pd.__version__) < LooseVersion('0.23') - -__all__ = ['is_string_dtype', 'is_numeric_dtype', 'is_categorical', - 'is_string_like', 'is_categorical_dtype', 'is_datetime64_any_dtype', - 'concat', 'get_codes', 'to_numpy', 'assert_series_equal'] +PD_LT_023 = LooseVersion(pd.__version__) < LooseVersion("0.23") + +__all__ = [ + "is_string_dtype", + "is_numeric_dtype", + "is_categorical", + "is_string_like", + "is_categorical_dtype", + "is_datetime64_any_dtype", + "concat", + "get_codes", + "to_numpy", + "assert_series_equal", +] try: from pandas.testing import assert_series_equal @@ -23,21 +32,25 @@ def concat(*args, **kwargs): See pandas.compat """ - if PD_LT_023 and 'sort' in kwargs: + if PD_LT_023 and "sort" in kwargs: kwargs = kwargs.copy() - del kwargs['sort'] + del kwargs["sort"] elif not PD_LT_023: - if 'sort' not in kwargs: + if "sort" not in kwargs: kwargs = kwargs.copy() - kwargs['sort'] = False + kwargs["sort"] = False return pd.concat(*args, **kwargs) try: - from pandas.api.types import (is_numeric_dtype, is_categorical, - is_string_dtype, is_categorical_dtype, - is_datetime64_any_dtype) + from pandas.api.types import ( + is_numeric_dtype, + 
is_categorical, + is_string_dtype, + is_categorical_dtype, + is_datetime64_any_dtype, + ) # From pandas 0.20.1 def is_string_like(obj): @@ -50,15 +63,21 @@ def is_string_like(obj): Returns ------- - is_str_like : bool + bool Whether `obj` is a string or not. """ return isinstance(obj, str) + except ImportError: # pragma: no cover - from pandas.core.common import (is_string_dtype, is_numeric_dtype, - is_categorical, is_categorical_dtype, - is_datetime64_any_dtype, is_string_like) + from pandas.core.common import ( + is_string_dtype, + is_numeric_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_any_dtype, + is_string_like, + ) def get_codes(index): diff --git a/linearmodels/compat/statsmodels.py b/linearmodels/compat/statsmodels.py index 6c91ab84eb..f939bf3117 100644 --- a/linearmodels/compat/statsmodels.py +++ b/linearmodels/compat/statsmodels.py @@ -8,10 +8,10 @@ def as_html(self): Returns ------- - html : string + str concatenated summary tables in HTML format """ - html = summary.summary_return(self.tables, return_fmt='html') + html = summary.summary_return(self.tables, return_fmt="html") if self.extra_txt is not None: - html = html + '
<br/><br/>' + self.extra_txt.replace('\n', '<br/>') + html = html + "<br/><br/>" + self.extra_txt.replace("\n", "<br/>
") return html diff --git a/linearmodels/conftest.py b/linearmodels/conftest.py index c6b08602c6..421b392797 100644 --- a/linearmodels/conftest.py +++ b/linearmodels/conftest.py @@ -2,15 +2,13 @@ def pytest_addoption(parser): - parser.addoption("--skip-slow", action="store_true", - help="skip slow tests") - parser.addoption("--only-slow", action="store_true", - help="run only slow tests") + parser.addoption("--skip-slow", action="store_true", help="skip slow tests") + parser.addoption("--only-slow", action="store_true", help="run only slow tests") def pytest_runtest_setup(item): - if 'slow' in item.keywords and item.config.getoption("--skip-slow"): # pragma: no cover - pytest.skip("skipping due to --skip-slow") # pragma: no cover + if "slow" in item.keywords and item.config.getoption("--skip-slow"): + pytest.skip("skipping due to --skip-slow") - if 'slow' not in item.keywords and item.config.getoption("--only-slow"): # pragma: no cover - pytest.skip("skipping due to --only-slow") # pragma: no cover + if "slow" not in item.keywords and item.config.getoption("--only-slow"): + pytest.skip("skipping due to --only-slow") diff --git a/linearmodels/datasets/__init__.py b/linearmodels/datasets/__init__.py index 76f82fb947..06a1c19514 100644 --- a/linearmodels/datasets/__init__.py +++ b/linearmodels/datasets/__init__.py @@ -8,4 +8,4 @@ def get_path(f): def load(module, file_name): - return pd.read_csv(join(get_path(module), file_name), compression='bz2') + return pd.read_csv(join(get_path(module), file_name), compression="bz2") diff --git a/linearmodels/datasets/birthweight/__init__.py b/linearmodels/datasets/birthweight/__init__.py index db57de8278..e148fa72eb 100644 --- a/linearmodels/datasets/birthweight/__init__.py +++ b/linearmodels/datasets/birthweight/__init__.py @@ -22,4 +22,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'birthweight.csv.bz2') + + return datasets.load(__file__, "birthweight.csv.bz2") diff --git a/linearmodels/datasets/card/__init__.py b/linearmodels/datasets/card/__init__.py index bcdbe5ea05..04cb809e66 100644 --- a/linearmodels/datasets/card/__init__.py +++ b/linearmodels/datasets/card/__init__.py @@ -43,4 +43,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'card.csv.bz2') + + return datasets.load(__file__, "card.csv.bz2") diff --git a/linearmodels/datasets/fertility/__init__.py b/linearmodels/datasets/fertility/__init__.py index 4af9ff36f0..dc0d0d58f3 100644 --- a/linearmodels/datasets/fertility/__init__.py +++ b/linearmodels/datasets/fertility/__init__.py @@ -34,4 +34,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'fertility.csv.bz2') + + return datasets.load(__file__, "fertility.csv.bz2") diff --git a/linearmodels/datasets/french/__init__.py b/linearmodels/datasets/french/__init__.py index af2489b7f6..d5f27e48be 100644 --- a/linearmodels/datasets/french/__init__.py +++ b/linearmodels/datasets/french/__init__.py @@ -45,6 +45,7 @@ def load(): from linearmodels import datasets - data = datasets.load(__file__, 'french.csv.bz2') - data['dates'] = pd.to_datetime(data.dates) + + data = datasets.load(__file__, "french.csv.bz2") + data["dates"] = pd.to_datetime(data.dates) return data diff --git a/linearmodels/datasets/fringe/__init__.py b/linearmodels/datasets/fringe/__init__.py index 2658da371a..0366e9387d 100644 --- a/linearmodels/datasets/fringe/__init__.py +++ b/linearmodels/datasets/fringe/__init__.py @@ -47,4 +47,5 @@ def load(): from linearmodels import datasets 
- return datasets.load(__file__, 'fringe.csv.bz2') + + return datasets.load(__file__, "fringe.csv.bz2") diff --git a/linearmodels/datasets/jobtraining/__init__.py b/linearmodels/datasets/jobtraining/__init__.py index 895d595a5b..1fb5ec8617 100644 --- a/linearmodels/datasets/jobtraining/__init__.py +++ b/linearmodels/datasets/jobtraining/__init__.py @@ -38,4 +38,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'jobtraining.csv.bz2') + + return datasets.load(__file__, "jobtraining.csv.bz2") diff --git a/linearmodels/datasets/meps/__init__.py b/linearmodels/datasets/meps/__init__.py index 28892f05c5..9a620b5bde 100644 --- a/linearmodels/datasets/meps/__init__.py +++ b/linearmodels/datasets/meps/__init__.py @@ -33,4 +33,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'meps.csv.bz2') + + return datasets.load(__file__, "meps.csv.bz2") diff --git a/linearmodels/datasets/mroz/__init__.py b/linearmodels/datasets/mroz/__init__.py index 77a84006d3..b926d465f2 100644 --- a/linearmodels/datasets/mroz/__init__.py +++ b/linearmodels/datasets/mroz/__init__.py @@ -30,4 +30,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'mroz.csv.bz2') + + return datasets.load(__file__, "mroz.csv.bz2") diff --git a/linearmodels/datasets/munnell/__init__.py b/linearmodels/datasets/munnell/__init__.py index 42e97c1139..0973e167de 100644 --- a/linearmodels/datasets/munnell/__init__.py +++ b/linearmodels/datasets/munnell/__init__.py @@ -18,4 +18,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'munnell.csv.bz2') + + return datasets.load(__file__, "munnell.csv.bz2") diff --git a/linearmodels/datasets/wage/__init__.py b/linearmodels/datasets/wage/__init__.py index fdfb153e74..35ae7c835b 100644 --- a/linearmodels/datasets/wage/__init__.py +++ b/linearmodels/datasets/wage/__init__.py @@ -24,4 +24,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'wage.csv.bz2') + + return datasets.load(__file__, "wage.csv.bz2") diff --git a/linearmodels/datasets/wage_panel/__init__.py b/linearmodels/datasets/wage_panel/__init__.py index ac04937c7b..5057bd204a 100644 --- a/linearmodels/datasets/wage_panel/__init__.py +++ b/linearmodels/datasets/wage_panel/__init__.py @@ -20,4 +20,5 @@ def load(): from linearmodels import datasets - return datasets.load(__file__, 'wage_panel.csv.bz2') + + return datasets.load(__file__, "wage_panel.csv.bz2") diff --git a/linearmodels/formula.py b/linearmodels/formula.py index 984a02a8c1..26a0e7dd4d 100644 --- a/linearmodels/formula.py +++ b/linearmodels/formula.py @@ -3,9 +3,20 @@ PanelOLS, PooledOLS, RandomEffects) from linearmodels.system import IV3SLS, SUR, IVSystemGMM -__all__ = ['between_ols', 'random_effects', 'first_difference_ols', - 'pooled_ols', 'panel_ols', 'iv_2sls', 'iv_gmm', 'iv_gmm_cue', - 'iv_liml', 'sur', 'iv_3sls', 'iv_system_gmm'] +__all__ = [ + "between_ols", + "random_effects", + "first_difference_ols", + "pooled_ols", + "panel_ols", + "iv_2sls", + "iv_gmm", + "iv_gmm_cue", + "iv_liml", + "sur", + "iv_3sls", + "iv_system_gmm", +] iv_2sls = IV2SLS.from_formula iv_liml = IVLIML.from_formula diff --git a/linearmodels/iv/__init__.py b/linearmodels/iv/__init__.py index 70e7d4c750..3a89bfc8eb 100644 --- a/linearmodels/iv/__init__.py +++ b/linearmodels/iv/__init__.py @@ -2,5 +2,12 @@ from .model import IV2SLS, IVGMM, IVGMMCUE, IVLIML # flake8: noqa from .results import compare # flake8: noqa -__all__ = ['IV2SLS', 'IVGMM', 
'IVGMMCUE', 'IVLIML', 'compare', - 'AbsorbingLS', 'Interaction'] +__all__ = [ + "IV2SLS", + "IVGMM", + "IVGMMCUE", + "IVLIML", + "compare", + "AbsorbingLS", + "Interaction", +] diff --git a/linearmodels/iv/_utility.py b/linearmodels/iv/_utility.py index 0da5867ba6..bdf27a6008 100644 --- a/linearmodels/iv/_utility.py +++ b/linearmodels/iv/_utility.py @@ -34,7 +34,7 @@ def proj(y: ndarray, x: ndarray) -> ndarray: Returns ------- - yhat : ndarray + ndarray Projected values of y (nobs by nseries) """ return x @ (np.linalg.pinv(x) @ y) @@ -53,7 +53,7 @@ def annihilate(y: ndarray, x: ndarray) -> ndarray: Returns ------- - eps : ndarray + ndarray Residuals values of y minus y projected on x (nobs by nseries) """ return y - proj(y, x) @@ -80,44 +80,52 @@ class IVFormulaParser(object): def __init__(self, formula: str, data: DataFrame, eval_env: int = 2): self._formula = formula self._data = data - self._na_action = NAAction(on_NA='raise', NA_types=[]) + self._na_action = NAAction(on_NA="raise", NA_types=[]) self._eval_env = eval_env self._components = {} # type: Dict[str, str] self._parse() def _parse(self): - blocks = self._formula.strip().split('~') + blocks = self._formula.strip().split("~") if len(blocks) == 2: dep = blocks[0].strip() exog = blocks[1].strip() - endog = '0' - instr = '0' + endog = "0" + instr = "0" elif len(blocks) == 3: blocks = [bl.strip() for bl in blocks] - if '[' not in blocks[1] or ']' not in blocks[2]: - raise ValueError('formula not understood. Endogenous variables and ' - 'instruments must be segregated in a block that ' - 'starts with [ and ends with ].') + if "[" not in blocks[1] or "]" not in blocks[2]: + raise ValueError( + "formula not understood. Endogenous variables and " + "instruments must be segregated in a block that " + "starts with [ and ends with ]." + ) dep = blocks[0].strip() - exog, endog = [bl.strip() for bl in blocks[1].split('[')] - instr, exog2 = [bl.strip() for bl in blocks[2].split(']')] - if endog[0] == '+' or endog[-1] == '+': - raise ValueError('endogenous block must not start or end with +. This block ' - 'was: {0}'.format(endog)) - if instr[0] == '+' or instr[-1] == '+': - raise ValueError('instrument block must not start or end with +. This ' - 'block was: {0}'.format(instr)) + exog, endog = [bl.strip() for bl in blocks[1].split("[")] + instr, exog2 = [bl.strip() for bl in blocks[2].split("]")] + if endog[0] == "+" or endog[-1] == "+": + raise ValueError( + "endogenous block must not start or end with +. This block " + "was: {0}".format(endog) + ) + if instr[0] == "+" or instr[-1] == "+": + raise ValueError( + "instrument block must not start or end with +. 
This " "block was: {0}".format(instr) ) if exog2: exog += exog2 if exog: - exog = exog[:-1].strip() if exog[-1] == '+' else exog - exog = '0' if not exog else '0 + ' + exog + exog = exog[:-1].strip() if exog[-1] == "+" else exog + exog = "0" if not exog else "0 + " + exog else: - raise ValueError('formula contains more than 2 separators (~)') - comp = {'dependent': '0 + ' + dep, - 'exog': exog, - 'endog': endog, - 'instruments': instr} + raise ValueError("formula contains more than 2 separators (~)") + comp = { + "dependent": "0 + " + dep, + "exog": exog, + "endog": endog, + "instruments": instr, + } self._components = comp @property @@ -140,33 +148,53 @@ def data(self) -> Tuple[OptionalDataFrame, ...]: @property def dependent(self) -> DataFrame: """Dependent variable""" - dep = self.components['dependent'] - dep = dmatrix('0 + ' + dep, self._data, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + dep = self.components["dependent"] + dep = dmatrix( + "0 + " + dep, + self._data, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) return dep @property def exog(self) -> OptionalDataFrame: """Exogenous variables""" - exog = self.components['exog'] - exog = dmatrix(exog, self._data, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + exog = self.components["exog"] + exog = dmatrix( + exog, + self._data, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) return self._empty_check(exog) @property def endog(self) -> OptionalDataFrame: """Endogenous variables""" - endog = self.components['endog'] - endog = dmatrix('0 + ' + endog, self._data, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + endog = self.components["endog"] + endog = dmatrix( + "0 + " + endog, + self._data, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) return self._empty_check(endog) @property def instruments(self) -> OptionalDataFrame: """Instruments""" - instr = self.components['instruments'] - instr = dmatrix('0 + ' + instr, self._data, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + instr = self.components["instruments"] + instr = dmatrix( + "0 + " + instr, + self._data, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) return self._empty_check(instr) diff --git a/linearmodels/iv/absorbing.py b/linearmodels/iv/absorbing.py index 2f95f13ca0..4c236a911c 100644 --- a/linearmodels/iv/absorbing.py +++ b/linearmodels/iv/absorbing.py @@ -27,7 +27,7 @@ except ImportError: from hashlib import sha1 as hash_func -SCALAR_DTYPES = {'int8': int8, 'int16': int16, 'int32': int32, 'int64': int64} +SCALAR_DTYPES = {"int8": int8, "int16": int16, "int32": int32, "int64": int64} _VARIABLE_CACHE = defaultdict(dict) # type: DefaultDict[Hashable, Dict[str, ndarray]] @@ -45,8 +45,9 @@ def clear_cache(): _VARIABLE_CACHE.clear() -def lsmr_annihilate(x: csc_matrix, y: ndarray, use_cache: bool = True, x_hash=None, - **lsmr_options) -> ndarray: +def lsmr_annihilate( + x: csc_matrix, y: ndarray, use_cache: bool = True, x_hash=None, **lsmr_options +) -> ndarray: r""" Removes projection of y on x from y @@ -66,7 +67,7 @@ def lsmr_annihilate(x: csc_matrix, y: ndarray, use_cache: bool = True, x_hash=No Returns ------- - resids : ndarray + ndarray Returns the residuals from regressing y on x, (nobs, nvar) Notes @@ -81,14 +82,14 @@ def lsmr_annihilate(x: csc_matrix, y: ndarray, use_cache: bool
= True, x_hash=No """ use_cache = use_cache and x_hash is not None - regressor_hash = x_hash if x_hash is not None else '' + regressor_hash = x_hash if x_hash is not None else "" default_opts = dict(atol=1e-8, btol=1e-8, show=False) default_opts.update(lsmr_options) resids = [] for i in range(y.shape[1]): - _y = y[:, i:i + 1] + _y = y[:, i : i + 1] - variable_digest = '' + variable_digest = "" if use_cache: hasher = hash_func() hasher.update(ascontiguousarray(_y.data)) @@ -98,7 +99,7 @@ def lsmr_annihilate(x: csc_matrix, y: ndarray, use_cache: bool = True, x_hash=No resid = _VARIABLE_CACHE[regressor_hash][variable_digest] else: beta = lsmr(x, _y, **default_opts)[0] - resid = y[:, i:i + 1] - (x.dot(csc_matrix(beta[:, None]))).A + resid = y[:, i : i + 1] - (x.dot(csc_matrix(beta[:, None]))).A _VARIABLE_CACHE[regressor_hash][variable_digest] = resid resids.append(resid) if resids: @@ -119,7 +120,7 @@ def category_product(cats: AnyPandas) -> Series: Returns ------- - cp : Series + Series Categorical series containing the cartesian product of the categories in cats """ @@ -129,7 +130,7 @@ def category_product(cats: AnyPandas) -> Series: sizes = [] for c in cats: if not is_categorical(cats[c]): - raise TypeError('cats must contain only categorical variables') + raise TypeError("cats must contain only categorical variables") col = cats[c] max_code = get_codes(col.cat).max() size = 1 @@ -139,14 +140,18 @@ def category_product(cats: AnyPandas) -> Series: nobs = cats.shape[0] total_size = sum(sizes) if total_size >= 63: - raise ValueError('There are too many cats with too many states to use this method.') + raise ValueError( + "There are too many cats with too many states to use this method." + ) dtype_size = min(filter(lambda v: total_size < (v - 1), (8, 16, 32, 64))) - dtype_str = 'int{0:d}'.format(dtype_size) + dtype_str = "int{0:d}".format(dtype_size) dtype_val = dtype(dtype_str) codes = zeros(nobs, dtype=dtype_val) cum_size = 0 for i, col in enumerate(cats): - codes += (get_codes(cats[col].cat).astype(dtype_val) << SCALAR_DTYPES[dtype_str](cum_size)) + codes += get_codes(cats[col].cat).astype(dtype_val) << SCALAR_DTYPES[dtype_str]( + cum_size + ) cum_size += sizes[i] return Series(Categorical(codes), index=cats.index) @@ -162,15 +167,16 @@ def category_interaction(cat: Series, precondition: bool = True) -> csc_matrix: Returns ------- - dummies : csc_matrix + csc_matrix Sparse matrix of dummies with unit column norm """ codes = get_codes(category_product(cat).cat) return dummy_matrix(codes[:, None], precondition=precondition)[0] -def category_continuous_interaction(cat: AnyPandas, cont: AnyPandas, - precondition: bool = True) -> csc_matrix: +def category_continuous_interaction( + cat: AnyPandas, cont: AnyPandas, precondition: bool = True +) -> csc_matrix: """ Parameters ---------- @@ -183,7 +189,7 @@ def category_continuous_interaction(cat: AnyPandas, cont: AnyPandas, Returns ------- - interact : csc_matrix + csc_matrix Sparse matrix of dummy interactions with unit column norm """ codes = get_codes(category_product(cat).cat) @@ -240,10 +246,15 @@ class Interaction(object): >>> interact.sparse.shape # Cart product of all cats, 5**4, times ncont, 3 (100000, 1875) """ - _iv_data = IVData(None, 'none', 1) - def __init__(self, cat: OptionalArrayLike = None, cont: OptionalArrayLike = None, - nobs: int = None): + _iv_data = IVData(None, "none", 1) + + def __init__( + self, + cat: OptionalArrayLike = None, + cont: OptionalArrayLike = None, + nobs: int = None, + ): self._cat = cat self._cont = cont 
self._cat_data = self._iv_data @@ -257,26 +268,28 @@ def nobs(self): def _check_data(self): cat, cont = self._cat, self._cont - cat_nobs = getattr(cat, 'shape', (0,))[0] - cont_nobs = getattr(cont, 'shape', (0,))[0] + cat_nobs = getattr(cat, "shape", (0,))[0] + cont_nobs = getattr(cont, "shape", (0,))[0] nobs = max(cat_nobs, cont_nobs) if cat is None and cont is None: if self._nobs is not None: - self._cont_data = self._cat_data = IVData(None, 'none', nobs=self._nobs) + self._cont_data = self._cat_data = IVData(None, "none", nobs=self._nobs) else: - raise ValueError('nobs must be provided when cat and cont are None') + raise ValueError("nobs must be provided when cat and cont are None") return self._nobs = nobs - self._cat_data = IVData(cat, 'cat', nobs=nobs, convert_dummies=False) - self._cont_data = IVData(cont, 'cont', nobs=nobs, convert_dummies=False) + self._cat_data = IVData(cat, "cat", nobs=nobs, convert_dummies=False) + self._cont_data = IVData(cont, "cont", nobs=nobs, convert_dummies=False) if self._cat_data.shape[1] == self._cont_data.shape[1] == 0: - raise ValueError('Both cat and cont are empty arrays') + raise ValueError("Both cat and cont are empty arrays") cat_data = self._cat_data.pandas convert = [col for col in cat_data if not (is_categorical(cat_data[col]))] if convert: - cat_data = DataFrame({col: cat_data[col].astype('category') for col in cat_data}) - self._cat_data = IVData(cat_data, 'cat', convert_dummies=False) + cat_data = DataFrame( + {col: cat_data[col].astype("category") for col in cat_data} + ) + self._cat_data = IVData(cat_data, "cat", convert_dummies=False) @property def cat(self) -> DataFrame: @@ -303,7 +316,7 @@ def sparse(self) -> csc_matrix: Returns ------- - dummy_interact : csc_matrix + csc_matrix Dummy interaction constructed from the cartesian product of the categories and each of the continuous variables. 
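The hunks on either side of this point reformat the sparse dummy-interaction machinery in linearmodels/iv/absorbing.py (Interaction.sparse, the cache hashing and AbsorbingRegressor). As a quick orientation, here is a minimal sketch of what Interaction produces; it is not part of the diff, uses only the public API shown in the surrounding hunks, assumes from_frame splits columns into cats and conts by dtype (category versus numeric), and the data and column names are hypothetical:

import numpy as np
import pandas as pd

from linearmodels.iv.absorbing import Interaction

rs = np.random.RandomState(0)
nobs = 1000
df = pd.DataFrame(
    {
        # category-dtype columns become cats, remaining columns conts
        "cat_a": pd.Categorical(rs.randint(0, 3, nobs)),
        "cat_b": pd.Categorical(rs.randint(0, 4, nobs)),
        "cont": rs.standard_normal(nobs),
    }
)
interact = Interaction.from_frame(df)
# One sparse column per observed (cat_a, cat_b) cell, each scaled by cont,
# so at most 3 * 4 * 1 = 12 columns: cells in the cartesian product times
# the number of continuous variables, matching the class docstring above
print(interact.sparse.shape)

These are the columns that AbsorbingRegressor._regressors (below) stacks with sp.hstack into a single csc_matrix, which lsmr_annihilate (above) then sweeps out of the dependent and exogenous data.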
@@ -320,9 +333,12 @@ def sparse(self) -> csc_matrix: if self.cat.shape[1] and self.cont.shape[1]: out = [] for col in self.cont: - out.append(category_continuous_interaction(self.cat, self.cont[col], - precondition=False)) - return sp.hstack(out, format='csc') + out.append( + category_continuous_interaction( + self.cat, self.cont[col], precondition=False + ) + ) + return sp.hstack(out, format="csc") elif self.cat.shape[1]: return category_interaction(category_product(self.cat), precondition=False) elif self.cont.shape[1]: @@ -340,7 +356,9 @@ def hash(self): cat_hashes = [] cat = self.cat for col in cat: - hasher.update(ascontiguousarray(to_numpy(get_codes(self.cat[col].cat)).data)) + hasher.update( + ascontiguousarray(to_numpy(get_codes(self.cat[col].cat)).data) + ) cat_hashes.append(hasher.hexdigest()) hasher = _reset(hasher) cat_hashes = tuple(sorted(cat_hashes)) @@ -355,7 +373,7 @@ def hash(self): return sorted(hashes) @staticmethod - def from_frame(frame: DataFrame) -> 'Interaction': + def from_frame(frame: DataFrame) -> "Interaction": """ Convenience function that simplifies using a DataFrame @@ -368,7 +386,7 @@ def from_frame(frame: DataFrame) -> 'Interaction': Returns ------- - interaction : Interaction + Interaction Instance using the columns of frame Examples @@ -412,9 +430,14 @@ class AbsorbingRegressor(object): Weights, if any """ - def __init__(self, *, cat: DataFrame = None, cont: DataFrame = None, - interactions: List[Interaction] = None, - weights: ndarray = None): + def __init__( + self, + *, + cat: DataFrame = None, + cont: DataFrame = None, + interactions: List[Interaction] = None, + weights: ndarray = None + ): self._cat = cat self._cont = cont self._interactions = interactions @@ -438,7 +461,9 @@ def hash(self): hasher = hash_func() if self._cat is not None: for col in self._cat: - hasher.update(ascontiguousarray(to_numpy(get_codes(self._cat[col].cat)).data)) + hasher.update( + ascontiguousarray(to_numpy(get_codes(self._cat[col].cat)).data) + ) hashes.append((hasher.hexdigest(),)) hasher = _reset(hasher) if self._cont is not None: @@ -471,11 +496,13 @@ def _regressors(self) -> csc_matrix: regressors.extend([interact.sparse for interact in self._interactions]) if regressors: - regressor_mat = sp.hstack(regressors, format='csc') + regressor_mat = sp.hstack(regressors, format="csc") approx_rank = regressor_mat.shape[1] self._approx_rank = approx_rank if self._weights is not None: - return (sp.diags(sqrt(self._weights.squeeze())).dot(regressor_mat)).asformat('csc') + return ( + sp.diags(sqrt(self._weights.squeeze())).dot(regressor_mat) + ).asformat("csc") return regressor_mat else: self._approx_rank = 0 @@ -565,14 +592,19 @@ class AbsorbingLS(object): >>> mod = AbsorbingLS(dep, exog, absorb=absorb, interactions=iaction) """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike = None, *, - absorb: InteractionVar = None, - interactions: Union[InteractionVar, Iterable[InteractionVar]] = None, - weights: OptionalArrayLike = None): - - self._dependent = IVData(dependent, 'dependent') + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike = None, + *, + absorb: InteractionVar = None, + interactions: Union[InteractionVar, Iterable[InteractionVar]] = None, + weights: OptionalArrayLike = None + ): + + self._dependent = IVData(dependent, "dependent") self._nobs = nobs = self._dependent.shape[0] - self._exog = IVData(exog, 'exog', nobs=self._nobs) + self._exog = IVData(exog, "exog", nobs=self._nobs) self._absorb = absorb if isinstance(absorb,
DataFrame): self._absorb_inter = Interaction.from_frame(absorb) @@ -581,7 +613,7 @@ def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike = None, *, elif isinstance(absorb, Interaction): self._absorb_inter = absorb else: - raise TypeError('absorb must be a DataFrame or an Interaction') + raise TypeError("absorb must be a DataFrame or an Interaction") self._weights = weights self._is_weighted = False self._check_weights() @@ -598,7 +630,7 @@ def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike = None, *, self._drop_locs = self._drop_missing() self._columns = self._exog.cols self._index = self._dependent.rows - self._method = 'Absorbing LS' + self._method = "Absorbing LS" self._const_col = 0 self._has_constant = False @@ -634,22 +666,26 @@ def _check_weights(self): if self._weights is None: nobs = self._dependent.shape[0] self._is_weighted = False - self._weight_data = IVData(ones(nobs), 'weights') + self._weight_data = IVData(ones(nobs), "weights") else: self._is_weighted = True weights = IVData(self._weights).ndarray weights = weights / nanmean(weights) - self._weight_data = IVData(weights, var_name='weights', nobs=self._nobs) + self._weight_data = IVData(weights, var_name="weights", nobs=self._nobs) def _check_shape(self): nobs = self._nobs if self._absorb is not None: if self._absorb_inter.nobs != nobs: - raise ValueError('absorb and dependent have different number of observations') + raise ValueError( + "absorb and dependent have different number of observations" + ) for interact in self._interaction_list: if interact.nobs != nobs: - raise ValueError('interactions ({0}) and dependent have different number of ' - 'observations'.format(str(interact))) + raise ValueError( + "interactions ({0}) and dependent have different number of " + "observations".format(str(interact)) + ) @property def absorbed_dependent(self) -> DataFrame: @@ -658,7 +694,7 @@ def absorbed_dependent(self) -> DataFrame: Returns ------- - dependent : DataFrame + DataFrame Dependent after effects have been absorbed Raises @@ -668,7 +704,9 @@ """ if self._absorbed_dependent is not None: return self._absorbed_dependent - raise RuntimeError('fit must be called once before absorbed_dependent is available') + raise RuntimeError( + "fit must be called once before absorbed_dependent is available" + ) @property def absorbed_exog(self) -> DataFrame: @@ -677,7 +715,7 @@ def absorbed_exog(self) -> DataFrame: Returns ------- - exogenous : DataFrame + DataFrame Exogenous after effects have been absorbed Raises @@ -687,7 +725,7 @@ """ if self._absorbed_exog is not None: return self._absorbed_exog - raise RuntimeError('fit must be called once before absorbed_exog is available') + raise RuntimeError("fit must be called once before absorbed_exog is available") @property def weights(self): @@ -707,7 +745,7 @@ def has_constant(self): @property def instruments(self): - return IVData(None, 'instrument', nobs=self._dependent.shape[0]) + return IVData(None, "instrument", nobs=self._dependent.shape[0]) def _prepare_interactions(self): if self._interactions is None: @@ -723,13 +761,19 @@ elif isinstance(interact, Interaction): self._interaction_list.append(interact) else: - raise TypeError('interactions must contain DataFrames or Interactions') + raise TypeError( + "interactions must contain DataFrames or Interactions" + ) def _first_time_fit(self, use_cache, lsmr_options): weights = self.weights.ndarray if
self._is_weighted else None - areg = AbsorbingRegressor(cat=self._absorb_inter.cat, cont=self._absorb_inter.cont, - interactions=self._interaction_list, weights=weights) + areg = AbsorbingRegressor( + cat=self._absorb_inter.cat, + cont=self._absorb_inter.cont, + interactions=self._interaction_list, + weights=weights, + ) areg_constant = areg.has_constant self._regressors = preconditioner(areg.regressors)[0] self._num_params += areg.approx_rank @@ -751,10 +795,12 @@ def _first_time_fit(self, use_cache, lsmr_options): lsmr_options = {} if lsmr_options is None else lsmr_options if self._regressors.shape[1] > 0: - dep_resid = lsmr_annihilate(self._regressors, dep, use_cache, self._regressors_hash, - **lsmr_options) - exog_resid = lsmr_annihilate(self._regressors, exog, use_cache, - self._regressors_hash, **lsmr_options) + dep_resid = lsmr_annihilate( + self._regressors, dep, use_cache, self._regressors_hash, **lsmr_options + ) + exog_resid = lsmr_annihilate( + self._regressors, exog, use_cache, self._regressors_hash, **lsmr_options + ) else: dep_resid = dep exog_resid = exog @@ -763,13 +809,24 @@ def _first_time_fit(self, use_cache, lsmr_options): dep_resid += root_w * mu_dep exog_resid += root_w * mu_exog - self._absorbed_dependent = DataFrame(dep_resid, index=self._dependent.pandas.index, - columns=self._dependent.pandas.columns) - self._absorbed_exog = DataFrame(exog_resid, index=self._exog.pandas.index, - columns=self._exog.pandas.columns) - - def fit(self, *, cov_type: str = 'robust', debiased: bool = False, lsmr_options: dict = None, - use_cache: bool = True, **cov_config: Any): + self._absorbed_dependent = DataFrame( + dep_resid, + index=self._dependent.pandas.index, + columns=self._dependent.pandas.columns, + ) + self._absorbed_exog = DataFrame( + exog_resid, index=self._exog.pandas.index, columns=self._exog.pandas.columns + ) + + def fit( + self, + *, + cov_type: str = "robust", + debiased: bool = False, + lsmr_options: dict = None, + use_cache: bool = True, + **cov_config: Any + ): """ Estimate model parameters @@ -805,7 +862,7 @@ def fit(self, *, cov_type: str = 'robust', debiased: bool = False, lsmr_options: Returns ------- - results : AbsorbingLSResults + AbsorbingLSResults Results container Notes @@ -843,18 +900,19 @@ def fit(self, *, cov_type: str = 'robust', debiased: bool = False, lsmr_options: self._num_params += exog_resid.shape[1] cov_estimator = COVARIANCE_ESTIMATORS[cov_type] - cov_config['debiased'] = debiased - cov_config['kappa'] = 0.0 + cov_config["debiased"] = debiased + cov_config["kappa"] = 0.0 cov_config_copy = {k: v for k, v in cov_config.items()} - if 'center' in cov_config_copy: - del cov_config_copy['center'] - cov_estimator = cov_estimator(exog_resid, dep_resid, exog_resid, params, **cov_config_copy) + if "center" in cov_config_copy: + del cov_config_copy["center"] + cov_estimator = cov_estimator( + exog_resid, dep_resid, exog_resid, params, **cov_config_copy + ) - results = {'kappa': 0.0, - 'liml_kappa': 0.0} + results = {"kappa": 0.0, "liml_kappa": 0.0} pe = self._post_estimation(params, cov_estimator, cov_type) results.update(pe) - results['df_model'] = self._num_params + results["df_model"] = self._num_params return AbsorbingLSResults(results, self) @@ -869,7 +927,7 @@ def resids(self, params: ndarray): Returns ------- - resids : ndarray + ndarray Model residuals """ resids = self.wresids(params) @@ -886,7 +944,7 @@ def wresids(self, params: ndarray): Returns ------- - wresids : ndarray + ndarray Weighted model residuals Notes @@ -894,7 +952,9 @@ def 
wresids(self, params: ndarray): Uses weighted versions of data instead of raw data. Identical to resids if all weights are unity. """ - return to_numpy(self._absorbed_dependent) - to_numpy(self._absorbed_exog) @ params + return ( + to_numpy(self._absorbed_dependent) - to_numpy(self._absorbed_exog) @ params + ) def _f_statistic(self, params: ndarray, cov: ndarray, debiased: bool): const_loc = find_constant(self._exog.ndarray) @@ -906,10 +966,16 @@ def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str): columns = self._columns index = self._index eps = self.resids(params) - fitted = DataFrame(self._dependent.ndarray - eps, index=self._dependent.rows, - columns=['fitted_values']) - absorbed_effects = DataFrame(to_numpy(self._absorbed_dependent) - to_numpy(fitted), - columns=['absorbed_effects'], index=self._dependent.rows) + fitted = DataFrame( + self._dependent.ndarray - eps, + index=self._dependent.rows, + columns=["fitted_values"], + ) + absorbed_effects = DataFrame( + to_numpy(self._absorbed_dependent) - to_numpy(fitted), + columns=["absorbed_effects"], + index=self._dependent.rows, + ) weps = self.wresids(params) cov = cov_estimator.cov @@ -930,7 +996,7 @@ def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str): # If absorbing contains a constant, but exog does not, no need to demean if self._const_col is not None: col = self._const_col - x = to_numpy(self._absorbed_exog)[:, col:col + 1] + x = to_numpy(self._absorbed_exog)[:, col : col + 1] mu = (lstsq(x, to_numpy(e))[0]).squeeze() e = e - x * mu @@ -938,25 +1004,27 @@ def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str): r2_absorbed = max(1 - residual_ss / aborbed_total_ss, 0.0) fstat = self._f_statistic(params, cov, debiased) - out = {'params': Series(params.squeeze(), columns, name='parameter'), - 'eps': Series(eps.squeeze(), index=index, name='residual'), - 'weps': Series(weps.squeeze(), index=index, name='weighted residual'), - 'cov': DataFrame(cov, columns=columns, index=columns), - 's2': float(cov_estimator.s2), - 'debiased': debiased, - 'residual_ss': float(residual_ss), - 'total_ss': float(total_ss), - 'r2': float(r2), - 'fstat': fstat, - 'vars': columns, - 'instruments': [], - 'cov_config': cov_estimator.config, - 'cov_type': cov_type, - 'method': self._method, - 'cov_estimator': cov_estimator, - 'fitted': fitted, - 'original_index': self._original_index, - 'absorbed_effects': absorbed_effects, - 'absorbed_r2': r2_absorbed} + out = { + "params": Series(params.squeeze(), columns, name="parameter"), + "eps": Series(eps.squeeze(), index=index, name="residual"), + "weps": Series(weps.squeeze(), index=index, name="weighted residual"), + "cov": DataFrame(cov, columns=columns, index=columns), + "s2": float(cov_estimator.s2), + "debiased": debiased, + "residual_ss": float(residual_ss), + "total_ss": float(total_ss), + "r2": float(r2), + "fstat": fstat, + "vars": columns, + "instruments": [], + "cov_config": cov_estimator.config, + "cov_type": cov_type, + "method": self._method, + "cov_estimator": cov_estimator, + "fitted": fitted, + "original_index": self._original_index, + "absorbed_effects": absorbed_effects, + "absorbed_r2": r2_absorbed, + } return out diff --git a/linearmodels/iv/common.py b/linearmodels/iv/common.py index 3624a040f9..51b3ac5acf 100644 --- a/linearmodels/iv/common.py +++ b/linearmodels/iv/common.py @@ -23,8 +23,9 @@ def find_constant(x): return None -def f_statistic(params: ndarray, cov: ndarray, debiased: bool, resid_df: int, - const_loc: int = None): +def 
f_statistic( + params: ndarray, cov: ndarray, debiased: bool, resid_df: int, const_loc: int = None +): """ Parameters ---------- @@ -42,18 +43,20 @@ def f_statistic(params: ndarray, cov: ndarray, debiased: bool, resid_df: int, Returns ------- - f_stat : WaldTestStatistic + WaldTestStatistic WaldTestStatistic instance """ - null = 'All parameters ex. constant are zero' - name = 'Model F-statistic' + null = "All parameters ex. constant are zero" + name = "Model F-statistic" nvar = params.shape[0] non_const = list(range(nvar)) if const_loc is not None: non_const.pop(const_loc) if not non_const: - return InvalidTestStatistic('Model contains no non-constant exogenous terms', name=name) + return InvalidTestStatistic( + "Model contains no non-constant exogenous terms", name=name + ) test_params = params[non_const] test_cov = cov[ix_(non_const, non_const)] test_stat = test_params.T @ inv(test_cov) @ test_params diff --git a/linearmodels/iv/covariance.py b/linearmodels/iv/covariance.py index fa049610ab..f9c3b0f113 100644 --- a/linearmodels/iv/covariance.py +++ b/linearmodels/iv/covariance.py @@ -10,10 +10,12 @@ from linearmodels.typing import Numeric, OptionalNumeric -KernelWeight = Union[Callable[[int, float], ndarray], - Callable[[float, float], ndarray], - Callable[[int, VarArg(Any)], ndarray], - Callable[[Numeric, int], Any]] +KernelWeight = Union[ + Callable[[int, float], ndarray], + Callable[[float, float], ndarray], + Callable[[int, VarArg(Any)], ndarray], + Callable[[Numeric, int], Any], +] CLUSTER_ERR = """ clusters has the wrong nobs. Expected {0}, got {1}. Any missing observation @@ -36,7 +38,7 @@ def _cov_cluster(z: ndarray, clusters: ndarray) -> ndarray: Returns ------- - c : ndarray + ndarray k by k cluster asymptotic covariance """ @@ -71,14 +73,16 @@ def _cov_kernel(z: ndarray, w: ndarray) -> ndarray: Returns ------- - c : ndarray + ndarray k by k kernel asymptotic covariance """ k = len(w) n = z.shape[0] if k > n: - raise ValueError('Length of w ({0}) is larger than the number ' - 'of elements in z ({1})'.format(k, n)) + raise ValueError( + "Length of w ({0}) is larger than the number " + "of elements in z ({1})".format(k, n) + ) s = z.T @ z for i in range(1, len(w)): op = z[i:].T @ z[:-i] @@ -99,7 +103,7 @@ def kernel_weight_bartlett(bw: int, *args) -> ndarray: Returns ------- - weights : ndarray + ndarray Weight array ordered by lag position (maxlag + 1) Notes @@ -124,7 +128,7 @@ def kernel_weight_quadratic_spectral(bw: Numeric, n: int) -> ndarray: Returns ------- - weights : ndarray + ndarray Weight array ordered by lag position (maxlag + 1) Notes @@ -165,7 +169,7 @@ def kernel_weight_parzen(bw: int, *args) -> ndarray: Returns ------- - weights : ndarray + ndarray Weight array ordered by lag position (maxlag + 1) Notes @@ -182,7 +186,7 @@ def kernel_weight_parzen(bw: int, *args) -> ndarray: return w -def kernel_optimal_bandwidth(x: ndarray, kernel: str = 'bartlett') -> int: +def kernel_optimal_bandwidth(x: ndarray, kernel: str = "bartlett") -> int: """ Parameters x : ndarray @@ -196,7 +200,7 @@ def kernel_optimal_bandwidth(x: ndarray, kernel: str = 'bartlett') -> int: Returns ------- - m : int + int Optimal bandwidth. Set to nobs - 1 if computed bandwidth is larger. 
Notes @@ -215,17 +219,17 @@ def kernel_optimal_bandwidth(x: ndarray, kernel: str = 'bartlett') -> int: """ t = x.shape[0] x = x.squeeze() - if kernel in ('bartlett', 'newey-west'): + if kernel in ("bartlett", "newey-west"): q, c = 1, 1.1447 m_star = int(ceil(4 * (t / 100) ** (2 / 9))) - elif kernel in ('qs', 'andrews', 'quadratic-spectral'): + elif kernel in ("qs", "andrews", "quadratic-spectral"): q, c = 2, 1.3221 m_star = int(ceil(4 * (t / 100) ** (4 / 25))) - elif kernel in ('gallant', 'parzen'): + elif kernel in ("gallant", "parzen"): q, c = 2, 2.6614 m_star = int(ceil(4 * (t / 100) ** (2 / 25))) else: - raise ValueError('Unknown kernel: {0}'.format(kernel)) + raise ValueError("Unknown kernel: {0}".format(kernel)) sigma = empty(m_star + 1) sigma[0] = x.T @ x / t for i in range(1, m_star + 1): @@ -238,13 +242,15 @@ def kernel_optimal_bandwidth(x: ndarray, kernel: str = 'bartlett') -> int: return min(int(ceil(m)), t - 1) -KERNEL_LOOKUP = {'bartlett': kernel_weight_bartlett, - 'newey-west': kernel_weight_bartlett, - 'quadratic-spectral': kernel_weight_quadratic_spectral, - 'qs': kernel_weight_quadratic_spectral, - 'andrews': kernel_weight_quadratic_spectral, - 'gallant': kernel_weight_parzen, - 'parzen': kernel_weight_parzen} # type: Dict[str, KernelWeight] +KERNEL_LOOKUP = { + "bartlett": kernel_weight_bartlett, + "newey-west": kernel_weight_bartlett, + "quadratic-spectral": kernel_weight_quadratic_spectral, + "qs": kernel_weight_quadratic_spectral, + "andrews": kernel_weight_quadratic_spectral, + "gallant": kernel_weight_parzen, + "parzen": kernel_weight_parzen, +} # type: Dict[str, KernelWeight] class HomoskedasticCovariance(object): @@ -290,12 +296,19 @@ class HomoskedasticCovariance(object): :math:`Z` is the matrix of instruments, including exogenous regressors. 
""" - def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, debiased: bool = False, - kappa: Numeric = 1): + def __init__( + self, + x: ndarray, + y: ndarray, + z: ndarray, + params: ndarray, + debiased: bool = False, + kappa: Numeric = 1, + ): if not (x.shape[0] == y.shape[0] == z.shape[0]): - raise ValueError('x, y and z must have the same number of rows') + raise ValueError("x, y and z must have the same number of rows") if not x.shape[1] == len(params): - raise ValueError('x and params must have compatible dimensions') + raise ValueError("x and params must have compatible dimensions") self.x = x self.y = y @@ -307,19 +320,22 @@ def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, debiased self._pinvz = pinv(z) nobs, nvar = x.shape self._scale = nobs / (nobs - nvar) if self._debiased else 1 - self._name = 'Unadjusted Covariance (Homoskedastic)' + self._name = "Unadjusted Covariance (Homoskedastic)" def __str__(self) -> str: out = self._name - out += '\nDebiased: {0}'.format(self._debiased) + out += "\nDebiased: {0}".format(self._debiased) if self._kappa != 1: - out += '\nKappa: {0:0.3f}'.format(self._kappa) + out += "\nKappa: {0:0.3f}".format(self._kappa) return out def __repr__(self) -> str: - return self.__str__() + '\n' + \ - self.__class__.__name__ + \ - ', id: {0}'.format(hex(id(self))) + return ( + self.__str__() + + "\n" + + self.__class__.__name__ + + ", id: {0}".format(hex(id(self))) + ) @property def s(self) -> ndarray: @@ -371,8 +387,7 @@ def debiased(self) -> bool: @property def config(self) -> Dict[str, Any]: - return {'debiased': self.debiased, - 'kappa': self._kappa} + return {"debiased": self.debiased, "kappa": self._kappa} class HeteroskedasticCovariance(HomoskedasticCovariance): @@ -420,10 +435,19 @@ class HeteroskedasticCovariance(HomoskedasticCovariance): :math:`Z` is the matrix of instruments, including exogenous regressors. 
""" - def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, debiased: bool = False, - kappa: Numeric = 1): - super(HeteroskedasticCovariance, self).__init__(x, y, z, params, debiased, kappa) - self._name = 'Robust Covariance (Heteroskedastic)' + def __init__( + self, + x: ndarray, + y: ndarray, + z: ndarray, + params: ndarray, + debiased: bool = False, + kappa: Numeric = 1, + ): + super(HeteroskedasticCovariance, self).__init__( + x, y, z, params, debiased, kappa + ) + self._name = "Robust Covariance (Heteroskedastic)" @property def s(self) -> ndarray: @@ -504,25 +528,33 @@ class KernelCovariance(HomoskedasticCovariance): linearmodels.iv.covariance.kernel_weight_quadratic_spectral """ - def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, - kernel: str = 'bartlett', - bandwidth: OptionalNumeric = None, debiased: bool = False, kappa: Numeric = 1): + def __init__( + self, + x: ndarray, + y: ndarray, + z: ndarray, + params: ndarray, + kernel: str = "bartlett", + bandwidth: OptionalNumeric = None, + debiased: bool = False, + kappa: Numeric = 1, + ): super(KernelCovariance, self).__init__(x, y, z, params, debiased, kappa) self._kernels = KERNEL_LOOKUP self._kernel = kernel self._bandwidth = bandwidth self._auto_bandwidth = False - self._name = 'Kernel Covariance (HAC)' + self._name = "Kernel Covariance (HAC)" if kernel not in KERNEL_LOOKUP: - raise ValueError('Unknown kernel: {0}'.format(kernel)) + raise ValueError("Unknown kernel: {0}".format(kernel)) def __str__(self) -> str: out = super(KernelCovariance, self).__str__() - out += '\nKernel: {0}'.format(self._kernel) - out += '\nAutomatic Bandwidth: {0}'.format(self._auto_bandwidth) + out += "\nKernel: {0}".format(self._kernel) + out += "\nAutomatic Bandwidth: {0}".format(self._auto_bandwidth) if self._bandwidth: - out += '\nBandwidth: {0}'.format(self._bandwidth) + out += "\nBandwidth: {0}".format(self._bandwidth) return out @property @@ -535,11 +567,12 @@ def s(self) -> ndarray: xhat = z @ (pinvz @ x) xhat_e = xhat * eps - kernel = self.config['kernel'] - bw = self.config['bandwidth'] + kernel = self.config["kernel"] + bw = self.config["bandwidth"] if bw is None: self._auto_bandwidth = True from linearmodels.utility import has_constant + const, loc = has_constant(xhat) sel = ones((xhat.shape[1], 1)) if const: @@ -556,10 +589,12 @@ def s(self) -> ndarray: @property def config(self) -> Dict[str, Any]: - return {'debiased': self.debiased, - 'bandwidth': self._bandwidth, - 'kernel': self._kernel, - 'kappa': self._kappa} + return { + "debiased": self.debiased, + "bandwidth": self._bandwidth, + "kernel": self._kernel, + "kappa": self._kappa, + } class ClusteredCovariance(HomoskedasticCovariance): @@ -614,8 +649,16 @@ class ClusteredCovariance(HomoskedasticCovariance): :math:`Z` is the matrix of instruments, including exogenous regressors. 
""" - def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, - clusters: ndarray = None, debiased: bool = False, kappa: Numeric = 1): + def __init__( + self, + x: ndarray, + y: ndarray, + z: ndarray, + params: ndarray, + clusters: ndarray = None, + debiased: bool = False, + kappa: Numeric = 1, + ): super(ClusteredCovariance, self).__init__(x, y, z, params, debiased, kappa) nobs = x.shape[0] @@ -626,15 +669,18 @@ def __init__(self, x: ndarray, y: ndarray, z: ndarray, params: ndarray, self._num_clusters = [len(unique(clusters))] self._num_clusters_str = str(self._num_clusters[0]) else: - self._num_clusters = [len(unique(clusters[:, 0])), len(unique(clusters[:, 1]))] - self._num_clusters_str = ', '.join(map(str, self._num_clusters)) + self._num_clusters = [ + len(unique(clusters[:, 0])), + len(unique(clusters[:, 1])), + ] + self._num_clusters_str = ", ".join(map(str, self._num_clusters)) if clusters is not None and clusters.shape[0] != nobs: raise ValueError(CLUSTER_ERR.format(nobs, clusters.shape[0])) - self._name = 'Clustered Covariance (One-Way)' + self._name = "Clustered Covariance (One-Way)" def __str__(self) -> str: out = super(ClusteredCovariance, self).__str__() - out += '\nNum Clusters: {0}'.format(self._num_clusters_str) + out += "\nNum Clusters: {0}".format(self._num_clusters_str) return out @property @@ -674,6 +720,8 @@ def rescale(s: ndarray, nc: int, nobs: int) -> ndarray: @property def config(self) -> Dict[str, Any]: - return {'debiased': self.debiased, - 'clusters': self._clusters, - 'kappa': self._kappa} + return { + "debiased": self.debiased, + "clusters": self._clusters, + "kappa": self._kappa, + } diff --git a/linearmodels/iv/data.py b/linearmodels/iv/data.py index d3d68f4a37..71ca0136a0 100644 --- a/linearmodels/iv/data.py +++ b/linearmodels/iv/data.py @@ -22,14 +22,14 @@ except ImportError: HAS_XARRAY = False -dim_err = '{0} has too many dims. Maximum is 2, actual is {1}' -type_err = 'Only ndarrays, DataArrays and Series and DataFrames are supported' +dim_err = "{0} has too many dims. Maximum is 2, actual is {1}" +type_err = "Only ndarrays, DataArrays and Series and DataFrames are supported" def convert_columns(s, drop_first): if is_categorical(s): out = pd.get_dummies(s, drop_first=drop_first) - out.columns = [str(s.name) + '.' + str(c) for c in out] + out.columns = [str(s.name) + "." + str(c) for c in out] return out return s @@ -52,7 +52,7 @@ class IVData(object): Variable name to use when naming variables in NumPy arrays or xarray DataArrays nobs : int, optiona - Number of observation, used when `x` is None. If `x` is array-like, + Number of observation, used when `x` is None. If `x` is array_like, then nobs is used to check the number of observations in `x`. 
convert_dummies : bool, optional Flat indicating whether pandas categoricals or string input data @@ -61,8 +61,14 @@ class IVData(object): Flag indicating to drop first dummy category """ - def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, - convert_dummies: bool = True, drop_first: bool = True): + def __init__( + self, + x: OptionalArrayLike, + var_name: str = "x", + nobs: int = None, + convert_dummies: bool = True, + drop_first: bool = True, + ): if isinstance(x, IVData): self.__dict__.update(copy.deepcopy(x.__dict__)) @@ -70,7 +76,7 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, if x is None and nobs is not None: x = np.empty((nobs, 0)) elif x is None: - raise ValueError('nobs required when x is None') + raise ValueError("nobs required when x is None") self.original = x xndim = x.ndim @@ -87,7 +93,7 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, if x.shape[1] == 1: cols = [var_name] else: - cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])] + cols = [var_name + ".{0}".format(i) for i in range(x.shape[1])] self._pandas = pd.DataFrame(x, index=index, columns=cols) self._row_labels = index self._col_labels = cols @@ -99,13 +105,15 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, copied = False columns = list(x.columns) if len(set(columns)) != len(columns): - raise ValueError('DataFrame contains duplicate column names. ' - 'All column names must be distinct') + raise ValueError( + "DataFrame contains duplicate column names. " + "All column names must be distinct" + ) all_numeric = True for col in x: c = x[col] if is_string_dtype(c.dtype) and c.map(is_string_like).all(): - c = c.astype('category') + c = c.astype("category") if not copied: x = x.copy() copied = True @@ -113,8 +121,9 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, dt = c.dtype all_numeric = all_numeric and is_numeric_dtype(dt) if not (is_numeric_dtype(dt) or is_categorical_dtype(dt)): - raise ValueError('Only numeric, string or categorical ' - 'data permitted') + raise ValueError( + "Only numeric, string or categorical " "data permitted" + ) if convert_dummies: x = expand_categoricals(x, drop_first) @@ -138,7 +147,7 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, index = list(x.coords[x.dims[0]].values) xr_cols = x.coords[x.dims[1]].values if is_numeric_dtype(xr_cols.dtype): - xr_cols = [var_name + '.{0}'.format(i) for i in range(x.shape[1])] + xr_cols = [var_name + ".{0}".format(i) for i in range(x.shape[1])] xr_cols = list(xr_cols) self._ndarray = x.values.astype(np.float64) self._pandas = pd.DataFrame(self._ndarray, columns=xr_cols, index=index) @@ -149,8 +158,9 @@ def __init__(self, x: OptionalArrayLike, var_name: str = 'x', nobs: int = None, if nobs is not None: if self._ndarray.shape[0] != nobs: - msg = 'Array required to have {nobs} obs, has ' \ - '{act}'.format(nobs=nobs, act=self._ndarray.shape[0]) + msg = "Array required to have {nobs} obs, has " "{act}".format( + nobs=nobs, act=self._ndarray.shape[0] + ) raise ValueError(msg) @property diff --git a/linearmodels/iv/gmm.py b/linearmodels/iv/gmm.py index 6d859ca8b2..70a5d9f850 100644 --- a/linearmodels/iv/gmm.py +++ b/linearmodels/iv/gmm.py @@ -53,7 +53,7 @@ def weight_matrix(self, x, z, eps): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. 
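For readers skimming the reformatted hunks: the homoskedastic weight matrix returned here is just the moment-condition covariance, S = s2 (Z'Z / n). A minimal sketch under that assumption, ignoring the centering and debiasing options the class supports; `homoskedastic_weight_matrix` below is a standalone illustration, not the library function:

```python
import numpy as np

def homoskedastic_weight_matrix(z: np.ndarray, eps: np.ndarray) -> np.ndarray:
    """Covariance of the moments g_i = z_i * eps_i under homoskedasticity."""
    nobs = z.shape[0]
    s2 = (eps.T @ eps).item() / nobs    # residual variance estimate
    return s2 * (z.T @ z / nobs)

rng = np.random.default_rng(42)
z = rng.standard_normal((200, 4))
eps = rng.standard_normal((200, 1))
print(homoskedastic_weight_matrix(z, eps).shape)   # (4, 4)
```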
""" nobs, nvar = x.shape @@ -70,11 +70,10 @@ def config(self): Returns ------- - config : dict + dict Dictionary containing weight estimator configuration information """ - return {'center': self._center, - 'debiased': self._debiased} + return {"center": self._center, "debiased": self._debiased} class HeteroskedasticWeightMatrix(HomoskedasticWeightMatrix): @@ -117,7 +116,7 @@ def weight_matrix(self, x, z, eps): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. """ nobs, nvar = x.shape @@ -173,8 +172,14 @@ class KernelWeightMatrix(HomoskedasticWeightMatrix): linearmodels.iv.covariance.kernel_weight_quadratic_spectral """ - def __init__(self, kernel='bartlett', bandwidth=None, center=False, - debiased=False, optimal_bw=False): + def __init__( + self, + kernel="bartlett", + bandwidth=None, + center=False, + debiased=False, + optimal_bw=False, + ): super(KernelWeightMatrix, self).__init__(center, debiased) self._bandwidth = bandwidth self._orig_bandwidth = bandwidth @@ -195,7 +200,7 @@ def weight_matrix(self, x, z, eps): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. """ nobs, nvar = x.shape @@ -224,13 +229,15 @@ def config(self): Returns ------- - config : dict + dict Dictionary containing weight estimator configuration information """ - return {'center': self._center, - 'bandwidth': self._bandwidth, - 'kernel': self._kernel, - 'debiased': self._debiased} + return { + "center": self._center, + "bandwidth": self._bandwidth, + "kernel": self._kernel, + "debiased": self._debiased, + } @property def bandwidth(self): @@ -270,7 +277,7 @@ def weight_matrix(self, x, z, eps): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. """ nobs, nvar = x.shape @@ -281,8 +288,10 @@ def weight_matrix(self, x, z, eps): clusters = self._clusters if clusters.shape[0] != nobs: - raise ValueError('clusters has the wrong nobs. Expected {0}, ' - 'got {1}'.format(nobs, clusters.shape[0])) + raise ValueError( + "clusters has the wrong nobs. 
Expected {0}, " + "got {1}".format(nobs, clusters.shape[0]) + ) clusters = asarray(clusters).copy().squeeze() s = _cov_cluster(ze, clusters) @@ -301,12 +310,14 @@ def config(self): Returns ------- - config : dict + dict Dictionary containing weight estimator configuration information """ - return {'center': self._center, - 'clusters': self._clusters, - 'debiased': self._debiased} + return { + "center": self._center, + "clusters": self._clusters, + "debiased": self._debiased, + } class IVGMMCovariance(HomoskedasticCovariance): @@ -366,46 +377,47 @@ class IVGMMCovariance(HomoskedasticCovariance): """ # TODO: 2-way clustering - def __init__(self, x, y, z, params, w, cov_type='robust', debiased=False, - **cov_config): + def __init__( + self, x, y, z, params, w, cov_type="robust", debiased=False, **cov_config + ): super(IVGMMCovariance, self).__init__(x, y, z, params, debiased) self._cov_type = cov_type self._cov_config = cov_config self.w = w - self._bandwidth = cov_config.get('bandwidth', None) - self._kernel = cov_config.get('kernel', '') - self._name = 'GMM Covariance' - if cov_type in ('robust', 'heteroskedastic'): + self._bandwidth = cov_config.get("bandwidth", None) + self._kernel = cov_config.get("kernel", "") + self._name = "GMM Covariance" + if cov_type in ("robust", "heteroskedastic"): score_cov_estimator = HeteroskedasticWeightMatrix - elif cov_type in ('unadjusted', 'homoskedastic'): + elif cov_type in ("unadjusted", "homoskedastic"): score_cov_estimator = HomoskedasticWeightMatrix - elif cov_type == 'clustered': + elif cov_type == "clustered": score_cov_estimator = OneWayClusteredWeightMatrix - elif cov_type == 'kernel': + elif cov_type == "kernel": score_cov_estimator = KernelWeightMatrix else: - raise ValueError('Unknown cov_type') + raise ValueError("Unknown cov_type") self._score_cov_estimator = score_cov_estimator def __str__(self): out = super(IVGMMCovariance, self).__str__() cov_type = self._cov_type - if cov_type in ('robust', 'heteroskedastic'): - out += '\nRobust (Heteroskedastic)' - elif cov_type in ('unadjusted', 'homoskedastic'): - out += '\nUnadjusted (Homoskedastic)' - elif cov_type == 'clustered': - out += '\nClustered (One-way)' - clusters = self._cov_config.get('clusters', None) + if cov_type in ("robust", "heteroskedastic"): + out += "\nRobust (Heteroskedastic)" + elif cov_type in ("unadjusted", "homoskedastic"): + out += "\nUnadjusted (Homoskedastic)" + elif cov_type == "clustered": + out += "\nClustered (One-way)" + clusters = self._cov_config.get("clusters", None) if clusters is not None: nclusters = len(unique(asarray(clusters))) - out += '\nNum Clusters: {0}'.format(nclusters) + out += "\nNum Clusters: {0}".format(nclusters) else: # kernel - out += '\nKernel (HAC)' - if self._cov_config.get('kernel', False): - out += '\nKernel: {0}'.format(self._cov_config['kernel']) - if self._cov_config.get('bandwidth', False): - out += '\nBandwidth: {0}'.format(self._cov_config['bandwidth']) + out += "\nKernel (HAC)" + if self._cov_config.get("kernel", False): + out += "\nKernel: {0}".format(self._cov_config["kernel"]) + if self._cov_config.get("bandwidth", False): + out += "\nBandwidth: {0}".format(self._cov_config["bandwidth"]) return out @property @@ -416,8 +428,9 @@ def cov(self): xpzw = xpz @ w xpzwzpx_inv = inv(xpzw @ xpz.T) - score_cov = self._score_cov_estimator(debiased=self.debiased, - **self._cov_config) + score_cov = self._score_cov_estimator( + debiased=self.debiased, **self._cov_config + ) s = score_cov.weight_matrix(x, z, eps) self._cov_config = 
score_cov.config @@ -426,6 +439,6 @@ def cov(self): @property def config(self): - conf = {'debiased': self.debiased} + conf = {"debiased": self.debiased} conf.update(self._cov_config) return conf diff --git a/linearmodels/iv/model.py b/linearmodels/iv/model.py index e407730a41..e1d9220b0d 100644 --- a/linearmodels/iv/model.py +++ b/linearmodels/iv/model.py @@ -28,31 +28,41 @@ IVResultType = Type[Union[IVResults, IVGMMResults, OLSResults]] -__all__ = ['COVARIANCE_ESTIMATORS', 'WEIGHT_MATRICES', 'IVGMM', 'IVLIML', 'IV2SLS', - 'IVGMMCUE', '_OLS'] - -COVARIANCE_ESTIMATORS = {'homoskedastic': HomoskedasticCovariance, - 'unadjusted': HomoskedasticCovariance, - 'HomoskedasticCovariance': HomoskedasticCovariance, - 'homo': HomoskedasticCovariance, - 'robust': HeteroskedasticCovariance, - 'heteroskedastic': HeteroskedasticCovariance, - 'HeteroskedasticCovariance': HeteroskedasticCovariance, - 'hccm': HeteroskedasticCovariance, - 'kernel': KernelCovariance, - 'KernelCovariance': KernelCovariance, - 'one-way': ClusteredCovariance, - 'clustered': ClusteredCovariance, - 'OneWayClusteredCovariance': ClusteredCovariance} - -WEIGHT_MATRICES = {'unadjusted': HomoskedasticWeightMatrix, - 'homoskedastic': HomoskedasticWeightMatrix, - 'robust': HeteroskedasticWeightMatrix, - 'heteroskedastic': HeteroskedasticWeightMatrix, - 'kernel': KernelWeightMatrix, - 'clustered': OneWayClusteredWeightMatrix, - 'one-way': OneWayClusteredWeightMatrix - } +__all__ = [ + "COVARIANCE_ESTIMATORS", + "WEIGHT_MATRICES", + "IVGMM", + "IVLIML", + "IV2SLS", + "IVGMMCUE", + "_OLS", +] + +COVARIANCE_ESTIMATORS = { + "homoskedastic": HomoskedasticCovariance, + "unadjusted": HomoskedasticCovariance, + "HomoskedasticCovariance": HomoskedasticCovariance, + "homo": HomoskedasticCovariance, + "robust": HeteroskedasticCovariance, + "heteroskedastic": HeteroskedasticCovariance, + "HeteroskedasticCovariance": HeteroskedasticCovariance, + "hccm": HeteroskedasticCovariance, + "kernel": KernelCovariance, + "KernelCovariance": KernelCovariance, + "one-way": ClusteredCovariance, + "clustered": ClusteredCovariance, + "OneWayClusteredCovariance": ClusteredCovariance, +} + +WEIGHT_MATRICES = { + "unadjusted": HomoskedasticWeightMatrix, + "homoskedastic": HomoskedasticWeightMatrix, + "robust": HeteroskedasticWeightMatrix, + "heteroskedastic": HeteroskedasticWeightMatrix, + "kernel": KernelWeightMatrix, + "clustered": OneWayClusteredWeightMatrix, + "one-way": OneWayClusteredWeightMatrix, +} class IVLIML(object): @@ -61,15 +71,15 @@ class IVLIML(object): Parameters ---------- - dependent : array-like + dependent : array_like Endogenous variables (nobs by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - endog : array-like + endog : array_like Endogenous regressors (nobs by nendog) - instruments : array-like + instruments : array_like Instrumental variables (nobs by ninstr) - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation fuller : float, optional Fuller's alpha to modify LIML estimator. 
Default returns unmodified @@ -112,24 +122,31 @@ class IVLIML(object): IV2SLS, IVGMM, IVGMMCUE """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, - endog: OptionalArrayLike, instruments: OptionalArrayLike, *, - weights: OptionalArrayLike = None, fuller: Numeric = 0, - kappa: OptionalNumeric = None): - - self.dependent = IVData(dependent, var_name='dependent') + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike, + endog: OptionalArrayLike, + instruments: OptionalArrayLike, + *, + weights: OptionalArrayLike = None, + fuller: Numeric = 0, + kappa: OptionalNumeric = None + ): + + self.dependent = IVData(dependent, var_name="dependent") nobs = self.dependent.shape[0] # type: int - self.exog = IVData(exog, var_name='exog', nobs=nobs) - self.endog = IVData(endog, var_name='endog', nobs=nobs) - self.instruments = IVData(instruments, var_name='instruments', nobs=nobs) + self.exog = IVData(exog, var_name="exog", nobs=nobs) + self.endog = IVData(endog, var_name="endog", nobs=nobs) + self.instruments = IVData(instruments, var_name="instruments", nobs=nobs) self._original_index = self.dependent.pandas.index if weights is None: weights = ones(self.dependent.shape) weights = IVData(weights).ndarray if any(weights <= 0): - raise ValueError('weights must be strictly positive.') + raise ValueError("weights must be strictly positive.") weights = weights / nanmean(weights) - self.weights = IVData(weights, var_name='weights', nobs=nobs) + self.weights = IVData(weights, var_name="weights", nobs=nobs) self._drop_locs = self._drop_missing() # dependent variable @@ -144,44 +161,55 @@ def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, self._wz = self._z * w self._has_constant = False - self._regressor_is_exog = array([True] * self.exog.shape[1] + - [False] * self.endog.shape[1]) + self._regressor_is_exog = array( + [True] * self.exog.shape[1] + [False] * self.endog.shape[1] + ) self._columns = self.exog.cols + self.endog.cols self._instr_columns = self.exog.cols + self.instruments.cols self._index = self.dependent.rows self._validate_inputs() - if not hasattr(self, '_method'): - self._method = 'IV-LIML' + if not hasattr(self, "_method"): + self._method = "IV-LIML" additional = [] if fuller != 0: - additional.append('fuller(alpha={0})'.format(fuller)) + additional.append("fuller(alpha={0})".format(fuller)) if kappa is not None: - additional.append('kappa={0}'.format(kappa)) + additional.append("kappa={0}".format(kappa)) if additional: - self._method += '(' + ', '.join(additional) + ')' - if not hasattr(self, '_result_container'): + self._method += "(" + ", ".join(additional) + ")" + if not hasattr(self, "_result_container"): self._result_container = IVResults # type: IVResultType self._kappa = kappa self._fuller = fuller if kappa is not None and not isscalar(kappa): - raise ValueError('kappa must be None or a scalar') + raise ValueError("kappa must be None or a scalar") if not isscalar(fuller): - raise ValueError('fuller must be None or a scalar') + raise ValueError("fuller must be None or a scalar") if kappa is not None and fuller != 0: import warnings - warnings.warn('kappa and fuller should not normally be used ' - 'simultaneously. Identical results can be computed ' - 'using kappa only', UserWarning) + + warnings.warn( + "kappa and fuller should not normally be used " + "simultaneously. 
Identical results can be computed " + "using kappa only", + UserWarning, + ) if endog is None and instruments is None: self._result_container = OLSResults - self._method = 'OLS' - self._formula = '' + self._method = "OLS" + self._formula = "" @staticmethod - def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = None, - fuller: float = 0, kappa: OptionalNumeric = None): + def from_formula( + formula: str, + data: DataFrame, + *, + weights: OptionalArrayLike = None, + fuller: float = 0, + kappa: OptionalNumeric = None + ): """ Parameters ---------- @@ -190,7 +218,7 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = section data : DataFrame DataFrame containing the variables used in the formula - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation fuller : float, optional Fuller's alpha to modify LIML estimator. Default returns unmodified @@ -201,7 +229,7 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = Returns ------- - model : IVLIML + IVLIML Model instance Notes @@ -227,24 +255,31 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = """ parser = IVFormulaParser(formula, data) dep, exog, endog, instr = parser.data - mod = IVLIML(dep, exog, endog, instr, weights=weights, - fuller=fuller, kappa=kappa) + mod = IVLIML( + dep, exog, endog, instr, weights=weights, fuller=fuller, kappa=kappa + ) mod.formula = formula return mod - def predict(self, params: ArrayLike, *, exog: OptionalArrayLike = None, - endog: OptionalArrayLike = None, data: DataFrame = None, - eval_env: int = 4) -> DataFrame: + def predict( + self, + params: ArrayLike, + *, + exog: OptionalArrayLike = None, + endog: OptionalArrayLike = None, + data: DataFrame = None, + eval_env: int = 4 + ) -> DataFrame: """ Predict values for additional data Parameters ---------- - params : array-like + params : array_like Model parameters (nvar by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - endog : array-like + endog : array_like Endogenous regressors (nobs by nendog) data : DataFrame Values to use when making predictions from a model constructed @@ -254,7 +289,7 @@ def predict(self, params: ArrayLike, *, exog: OptionalArrayLike = None, Returns ------- - predictions : DataFrame + DataFrame Fitted values from supplied data and parameters Notes @@ -273,11 +308,14 @@ def predict(self, params: ArrayLike, *, exog: OptionalArrayLike = None, values corresponding to the original model specification. """ if data is not None and self.formula is None: - raise ValueError('Unable to use data when the model was not ' - 'created using a formula.') + raise ValueError( + "Unable to use data when the model was not " "created using a formula." + ) if data is not None and (exog is not None or endog is not None): - raise ValueError('Predictions can only be constructed using one ' - 'of exog/endog or data, but not both.') + raise ValueError( + "Predictions can only be constructed using one " + "of exog/endog or data, but not both." 
+ ) if exog is not None or endog is not None: exog = IVData(exog).pandas endog = IVData(endog).pandas @@ -290,7 +328,7 @@ def predict(self, params: ArrayLike, *, exog: OptionalArrayLike = None, params = atleast_2d(asarray(params)) if params.shape[0] == 1: params = params.T - pred = DataFrame(x @ params, index=exog_endog.index, columns=['predictions']) + pred = DataFrame(x @ params, index=exog_endog.index, columns=["predictions"]) return pred @@ -307,18 +345,19 @@ def formula(self, value: str): def _validate_inputs(self): x, z = self._x, self._z if x.shape[1] == 0: - raise ValueError('Model must contain at least one regressor.') + raise ValueError("Model must contain at least one regressor.") if self.instruments.shape[1] < self.endog.shape[1]: - raise ValueError('The number of instruments ({0}) must be at least ' - 'as large as the number of endogenous regressors' - ' ({1}).'.format(self.instruments.shape[1], - self.endog.shape[1])) + raise ValueError( + "The number of instruments ({0}) must be at least " + "as large as the number of endogenous regressors" + " ({1}).".format(self.instruments.shape[1], self.endog.shape[1]) + ) if matrix_rank(x) < x.shape[1]: - raise ValueError('regressors [exog endog] do not have full ' - 'column rank') + raise ValueError("regressors [exog endog] do not have full " "column rank") if matrix_rank(z) < z.shape[1]: - raise ValueError('instruments [exog instruments] do not have ' - 'full column rank') + raise ValueError( + "instruments [exog instruments] do not have " "full column rank" + ) self._has_constant, self._const_loc = has_constant(x) def _drop_missing(self) -> ndarray: @@ -326,8 +365,10 @@ def _drop_missing(self) -> ndarray: missing = any(c_[[dh.isnull for dh in data]], 0) # type: ndarray if any(missing): if npall(missing): - raise ValueError('All observations contain missing data. ' - 'Model cannot be estimated.') + raise ValueError( + "All observations contain missing data. " + "Model cannot be estimated." 
+ ) self.dependent.drop(missing) self.exog.drop(missing) self.endog.drop(missing) @@ -338,7 +379,9 @@ def _drop_missing(self) -> ndarray: return missing @staticmethod - def estimate_parameters(x: ndarray, y: ndarray, z: ndarray, kappa: Numeric) -> ndarray: + def estimate_parameters( + x: ndarray, y: ndarray, z: ndarray, kappa: Numeric + ) -> ndarray: """ Parameter estimation without error checking @@ -355,7 +398,7 @@ def estimate_parameters(x: ndarray, y: ndarray, z: ndarray, kappa: Numeric) -> n Returns ------- - params : ndarray + ndarray Estimated parameters (nvar by 1) Notes @@ -384,8 +427,9 @@ def _estimate_kappa(self) -> float: q = vpmzv_sqinv @ (ex1.T @ ex1) @ vpmzv_sqinv return min(eigvalsh(q)) - def fit(self, *, cov_type: str = 'robust', debiased: bool = False, - **cov_config: Any): + def fit( + self, *, cov_type: str = "robust", debiased: bool = False, **cov_config: Any + ): """ Estimate model parameters @@ -413,7 +457,7 @@ def fit(self, *, cov_type: str = 'robust', debiased: bool = False, Returns ------- - results : IVResults + IVResults Results container Notes @@ -445,15 +489,14 @@ def fit(self, *, cov_type: str = 'robust', debiased: bool = False, params = self.estimate_parameters(wx, wy, wz, est_kappa) cov_estimator = COVARIANCE_ESTIMATORS[cov_type] - cov_config['debiased'] = debiased - cov_config['kappa'] = est_kappa + cov_config["debiased"] = debiased + cov_config["kappa"] = est_kappa cov_config_copy = {k: v for k, v in cov_config.items()} - if 'center' in cov_config_copy: - del cov_config_copy['center'] + if "center" in cov_config_copy: + del cov_config_copy["center"] cov_estimator = cov_estimator(wx, wy, wz, params, **cov_config_copy) - results = {'kappa': est_kappa, - 'liml_kappa': liml_kappa} + results = {"kappa": est_kappa, "liml_kappa": liml_kappa} pe = self._post_estimation(params, cov_estimator, cov_type) results.update(pe) @@ -470,7 +513,7 @@ def wresids(self, params: ndarray): Returns ------- - wresids : ndarray + ndarray Weighted model residuals Notes @@ -491,7 +534,7 @@ def resids(self, params: ndarray): Returns ------- - resids : ndarray + ndarray Model residuals """ return self._y - self._x @ params @@ -521,12 +564,12 @@ def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str): index = self._index eps = self.resids(params) y = self.dependent.pandas - fitted = DataFrame(asarray(y) - eps, y.index, ['fitted_values']) + fitted = DataFrame(asarray(y) - eps, y.index, ["fitted_values"]) weps = self.wresids(params) cov = cov_estimator.cov debiased = cov_estimator.debiased - residual_ss = (weps.T @ weps) + residual_ss = weps.T @ weps w = self.weights.ndarray e = self._wy @@ -537,24 +580,26 @@ def _post_estimation(self, params: ndarray, cov_estimator, cov_type: str): r2 = 1 - residual_ss / total_ss fstat = self._f_statistic(params, cov, debiased) - out = {'params': Series(params.squeeze(), columns, name='parameter'), - 'eps': Series(eps.squeeze(), index=index, name='residual'), - 'weps': Series(weps.squeeze(), index=index, name='weighted residual'), - 'cov': DataFrame(cov, columns=columns, index=columns), - 's2': float(cov_estimator.s2), - 'debiased': debiased, - 'residual_ss': float(residual_ss), - 'total_ss': float(total_ss), - 'r2': float(r2), - 'fstat': fstat, - 'vars': columns, - 'instruments': self._instr_columns, - 'cov_config': cov_estimator.config, - 'cov_type': cov_type, - 'method': self._method, - 'cov_estimator': cov_estimator, - 'fitted': fitted, - 'original_index': self._original_index} + out = { + "params": Series(params.squeeze(), 
columns, name="parameter"), + "eps": Series(eps.squeeze(), index=index, name="residual"), + "weps": Series(weps.squeeze(), index=index, name="weighted residual"), + "cov": DataFrame(cov, columns=columns, index=columns), + "s2": float(cov_estimator.s2), + "debiased": debiased, + "residual_ss": float(residual_ss), + "total_ss": float(total_ss), + "r2": float(r2), + "fstat": fstat, + "vars": columns, + "instruments": self._instr_columns, + "cov_config": cov_estimator.config, + "cov_type": cov_type, + "method": self._method, + "cov_estimator": cov_estimator, + "fitted": fitted, + "original_index": self._original_index, + } return out @@ -565,15 +610,15 @@ class IV2SLS(IVLIML): Parameters ---------- - dependent : array-like + dependent : array_like Endogenous variables (nobs by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - endog : array-like + endog : array_like Endogenous regressors (nobs by nendog) - instruments : array-like + instruments : array_like Instrumental variables (nobs by ninstr) - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation Notes @@ -598,15 +643,24 @@ class IV2SLS(IVLIML): IVLIML, IVGMM, IVGMMCUE """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, - endog: OptionalArrayLike, instruments: OptionalArrayLike, *, - weights: OptionalArrayLike = None): - self._method = 'IV-2SLS' - super(IV2SLS, self).__init__(dependent, exog, endog, instruments, - weights=weights, fuller=0, kappa=1) + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike, + endog: OptionalArrayLike, + instruments: OptionalArrayLike, + *, + weights: OptionalArrayLike = None + ): + self._method = "IV-2SLS" + super(IV2SLS, self).__init__( + dependent, exog, endog, instruments, weights=weights, fuller=0, kappa=1 + ) @staticmethod - def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = None): + def from_formula( + formula: str, data: DataFrame, *, weights: OptionalArrayLike = None + ): """ Parameters ---------- @@ -615,12 +669,12 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = section data : DataFrame DataFrame containing the variables used in the formula - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation Returns ------- - model : IV2SLS + IV2SLS Model instance Notes @@ -657,15 +711,15 @@ class IVGMM(IVLIML): Parameters ---------- - dependent : array-like + dependent : array_like Endogenous variables (nobs by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - endog : array-like + endog : array_like Endogenous regressors (nobs by nendog) - instruments : array-like + instruments : array_like Instrumental variables (nobs by ninstr) - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation weight_type : str Name of moment condition weight function to use in the GMM estimation @@ -702,21 +756,36 @@ class IVGMM(IVLIML): IV2SLS, IVLIML, IVGMMCUE """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, - endog: OptionalArrayLike, instruments: OptionalArrayLike, *, - weights: OptionalArrayLike = None, - weight_type: str = 'robust', **weight_config): - self._method = 'IV-GMM' + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike, + endog: OptionalArrayLike, + instruments: OptionalArrayLike, + *, + weights: OptionalArrayLike = None, + weight_type: str = "robust", + **weight_config 
+ ): + self._method = "IV-GMM" self._result_container = IVGMMResults - super(IVGMM, self).__init__(dependent, exog, endog, instruments, weights=weights) + super(IVGMM, self).__init__( + dependent, exog, endog, instruments, weights=weights + ) weight_matrix_estimator = WEIGHT_MATRICES[weight_type] self._weight = weight_matrix_estimator(**weight_config) self._weight_type = weight_type self._weight_config = self._weight.config @staticmethod - def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = None, - weight_type: str = 'robust', **weight_config: Any): + def from_formula( + formula: str, + data: DataFrame, + *, + weights: OptionalArrayLike = None, + weight_type: str = "robust", + **weight_config: Any + ): """ Parameters ---------- @@ -725,7 +794,7 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = section data : DataFrame DataFrame containing the variables used in the formula - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation weight_type : str Name of moment condition weight function to use in the GMM estimation @@ -747,7 +816,7 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = Returns ------- - model : IVGMM + IVGMM Model instance Examples @@ -761,8 +830,15 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = """ parser = IVFormulaParser(formula, data) dep, exog, endog, instr = parser.data - mod = IVGMM(dep, exog, endog, instr, weights=weights, weight_type=weight_type, - **weight_config) + mod = IVGMM( + dep, + exog, + endog, + instr, + weights=weights, + weight_type=weight_type, + **weight_config + ) mod.formula = formula return mod @@ -782,7 +858,7 @@ def estimate_parameters(x: ndarray, y: ndarray, z: ndarray, w: ndarray): Returns ------- - params : ndarray + ndarray Estimated parameters (nvar by 1) Notes @@ -794,8 +870,16 @@ def estimate_parameters(x: ndarray, y: ndarray, z: ndarray, w: ndarray): zpy = z.T @ y return inv(xpz @ w @ xpz.T) @ (xpz @ w @ zpy) - def fit(self, *, iter_limit: int = 2, tol: float = 1e-4, initial_weight: ndarray = None, - cov_type: str = 'robust', debiased: bool = False, **cov_config: Any): + def fit( + self, + *, + iter_limit: int = 2, + tol: float = 1e-4, + initial_weight: ndarray = None, + cov_type: str = "robust", + debiased: bool = False, + **cov_config: Any + ): """ Estimate model parameters @@ -837,7 +921,7 @@ def fit(self, *, iter_limit: int = 2, tol: float = 1e-4, initial_weight: ndarray Returns ------- - results : IVGMMResults + IVGMMResults Results container See also @@ -848,8 +932,13 @@ def fit(self, *, iter_limit: int = 2, tol: float = 1e-4, initial_weight: ndarray nobs = wy.shape[0] weight_matrix = self._weight.weight_matrix wmat = inv(wz.T @ wz / nobs) if initial_weight is None else initial_weight - sv = IV2SLS(self.dependent, self.exog, self.endog, self.instruments, - weights=self.weights) + sv = IV2SLS( + self.dependent, + self.exog, + self.endog, + self.instruments, + weights=self.weights, + ) _params = params = asarray(sv.fit().params)[:, None] # _params = params = self.estimate_parameters(wx, wy, wz, wmat) @@ -867,9 +956,10 @@ def fit(self, *, iter_limit: int = 2, tol: float = 1e-4, initial_weight: ndarray norm = delta.T @ vinv @ delta iters += 1 - cov_config['debiased'] = debiased - cov_estimator = IVGMMCovariance(wx, wy, wz, params, wmat, - cov_type, **cov_config) + cov_config["debiased"] = debiased + cov_estimator = IVGMMCovariance( + wx, wy, wz, params, wmat, 
cov_type, **cov_config + ) results = self._post_estimation(params, cov_estimator, cov_type) gmm_pe = self._gmm_post_estimation(params, wmat, iters) @@ -881,11 +971,13 @@ def fit(self, *, iter_limit: int = 2, tol: float = 1e-4, initial_weight: ndarray def _gmm_post_estimation(self, params: ndarray, weight_mat: ndarray, iters: int): """GMM-specific post-estimation results""" instr = self._instr_columns - gmm_specific = {'weight_mat': DataFrame(weight_mat, columns=instr, index=instr), - 'weight_type': self._weight_type, - 'weight_config': self._weight_type, - 'iterations': iters, - 'j_stat': self._j_statistic(params, weight_mat)} + gmm_specific = { + "weight_mat": DataFrame(weight_mat, columns=instr, index=instr), + "weight_type": self._weight_type, + "weight_config": self._weight_type, + "iterations": iters, + "j_stat": self._j_statistic(params, weight_mat), + } return gmm_specific @@ -896,7 +988,7 @@ def _j_statistic(self, params: ndarray, weight_mat: ndarray): eps = y - x @ params g_bar = (z * eps).mean(0) stat = float(nobs * g_bar.T @ weight_mat @ g_bar.T) - null = 'Expected moment conditions are equal to 0' + null = "Expected moment conditions are equal to 0" return WaldTestStatistic(stat, null, ninstr - nvar) @@ -906,15 +998,15 @@ class IVGMMCUE(IVGMM): Parameters ---------- - dependent : array-like + dependent : array_like Endogenous variables (nobs by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - endog : array-like + endog : array_like Endogenous regressors (nobs by nendog) - instruments : array-like + instruments : array_like Instrumental variables (nobs by ninstr) - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation weight_type : str Name of moment condition weight function to use in the GMM estimation @@ -949,19 +1041,39 @@ class IVGMMCUE(IVGMM): IV2SLS, IVLIML, IVGMM """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, - endog: OptionalArrayLike, instruments: OptionalArrayLike, *, - weights: OptionalArrayLike = None, - weight_type: str = 'robust', **weight_config): - self._method = 'IV-GMM-CUE' - super(IVGMMCUE, self).__init__(dependent, exog, endog, instruments, weights=weights, - weight_type=weight_type, **weight_config) - if 'center' not in weight_config: - weight_config['center'] = True + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike, + endog: OptionalArrayLike, + instruments: OptionalArrayLike, + *, + weights: OptionalArrayLike = None, + weight_type: str = "robust", + **weight_config + ): + self._method = "IV-GMM-CUE" + super(IVGMMCUE, self).__init__( + dependent, + exog, + endog, + instruments, + weights=weights, + weight_type=weight_type, + **weight_config + ) + if "center" not in weight_config: + weight_config["center"] = True @staticmethod - def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = None, - weight_type: str = 'robust', **weight_config: Any): + def from_formula( + formula: str, + data: DataFrame, + *, + weights: OptionalArrayLike = None, + weight_type: str = "robust", + **weight_config: Any + ): """ Parameters ---------- @@ -970,7 +1082,7 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = section data : DataFrame DataFrame containing the variables used in the formula - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation weight_type : str Name of moment condition weight function to use in the GMM estimation @@ -980,7 +1092,7 
@@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = Returns ------- - model : IVGMMCUE + IVGMMCUE Model instance Notes @@ -1005,8 +1117,15 @@ def from_formula(formula: str, data: DataFrame, *, weights: OptionalArrayLike = """ parser = IVFormulaParser(formula, data) dep, exog, endog, instr = parser.data - mod = IVGMMCUE(dep, exog, endog, instr, weights=weights, weight_type=weight_type, - **weight_config) + mod = IVGMMCUE( + dep, + exog, + endog, + instr, + weights=weights, + weight_type=weight_type, + **weight_config + ) mod.formula = formula return mod @@ -1027,7 +1146,7 @@ def j(self, params: ndarray, x: ndarray, y: ndarray, z: ndarray): Returns ------- - j : float + float GMM objective function, also known as the J statistic Notes @@ -1055,8 +1174,15 @@ def j(self, params: ndarray, x: ndarray, y: ndarray, z: ndarray): g_bar = (z * eps).mean(0) return nobs * g_bar.T @ w @ g_bar.T - def estimate_parameters(self, starting: ndarray, x: ndarray, y: ndarray, z: ndarray, - display: bool = False, opt_options: Dict[str, Any] = None): + def estimate_parameters( + self, + starting: ndarray, + x: ndarray, + y: ndarray, + z: ndarray, + display: bool = False, + opt_options: Dict[str, Any] = None, + ): r""" Parameters ---------- @@ -1076,7 +1202,7 @@ def estimate_parameters(self, starting: ndarray, x: ndarray, y: ndarray, z: ndar Returns ------- - params : ndarray + ndarray Estimated parameters (nvar by 1) Notes @@ -1090,18 +1216,25 @@ def estimate_parameters(self, starting: ndarray, x: ndarray, y: ndarray, z: ndar """ args = (x, y, z) opt_options = {} if opt_options is None else opt_options - options = {'disp': display} - if 'options' in opt_options: + options = {"disp": display} + if "options" in opt_options: opt_options = opt_options.copy() - options.update(opt_options.pop('options')) + options.update(opt_options.pop("options")) res = minimize(self.j, starting, args=args, options=options, **opt_options) return res.x[:, None], res.nit - def fit(self, *, starting: ndarray = None, display: bool = False, cov_type: str = 'robust', - debiased: bool = False, opt_options: Dict[str, Any] = None, - **cov_config: Any): + def fit( + self, + *, + starting: ndarray = None, + display: bool = False, + cov_type: str = "robust", + debiased: bool = False, + opt_options: Dict[str, Any] = None, + **cov_config: Any + ): r""" Estimate model parameters @@ -1130,7 +1263,7 @@ def fit(self, *, starting: ndarray = None, display: bool = False, cov_type: str Returns ------- - results : IVGMMResults + IVGMMResults Results container Notes @@ -1146,26 +1279,34 @@ def fit(self, *, starting: ndarray = None, display: bool = False, cov_type: str if starting is None: exog = None if self.exog.shape[1] == 0 else self.exog endog = None if self.endog.shape[1] == 0 else self.endog - instr = None if self.instruments.shape[1] == 0 else \ - self.instruments - - res = IVGMM(self.dependent, exog, endog, instr, - weights=self.weights, weight_type=self._weight_type, - **self._weight_config).fit() + instr = None if self.instruments.shape[1] == 0 else self.instruments + + res = IVGMM( + self.dependent, + exog, + endog, + instr, + weights=self.weights, + weight_type=self._weight_type, + **self._weight_config + ).fit() starting = asarray(res.params) else: starting = asarray(starting) if len(starting) != self.exog.shape[1] + self.endog.shape[1]: - raise ValueError('starting does not have the correct number ' - 'of values') - params, iters = self.estimate_parameters(starting, wx, wy, wz, display, - opt_options=opt_options) 
+ raise ValueError( + "starting does not have the correct number " "of values" + ) + params, iters = self.estimate_parameters( + starting, wx, wy, wz, display, opt_options=opt_options + ) eps = wy - wx @ params wmat = inv(weight_matrix(wx, wz, eps)) - cov_config['debiased'] = debiased - cov_estimator = IVGMMCovariance(wx, wy, wz, params, wmat, cov_type, - **cov_config) + cov_config["debiased"] = debiased + cov_estimator = IVGMMCovariance( + wx, wy, wz, params, wmat, cov_type, **cov_config + ) results = self._post_estimation(params, cov_estimator, cov_type) gmm_pe = self._gmm_post_estimation(params, wmat, iters) results.update(gmm_pe) @@ -1179,11 +1320,11 @@ class _OLS(IVLIML): Parameters ---------- - dependent : array-like + dependent : array_like Endogenous variables (nobs by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nexog) - weights : array-like, optional + weights : array_like, optional Observation weights used in estimation Notes @@ -1197,7 +1338,14 @@ class _OLS(IVLIML): statsmodels.regression.linear_model.GLS """ - def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike, *, - weights: OptionalArrayLike = None): - super(_OLS, self).__init__(dependent, exog, None, None, weights=weights, kappa=0.0) + def __init__( + self, + dependent: ArrayLike, + exog: OptionalArrayLike, + *, + weights: OptionalArrayLike = None + ): + super(_OLS, self).__init__( + dependent, exog, None, None, weights=weights, kappa=0.0 + ) self._result_container = OLSResults diff --git a/linearmodels/iv/results.py b/linearmodels/iv/results.py index cb4369212a..6c19ce367b 100644 --- a/linearmodels/iv/results.py +++ b/linearmodels/iv/results.py @@ -7,11 +7,11 @@ import datetime as dt from typing import Any, Dict, List, Union -from property_cached import cached_property from numpy import (array, c_, diag, empty, isnan, log, ndarray, ones, sqrt, zeros) from numpy.linalg import inv, pinv from pandas import DataFrame, Series, concat, to_numeric +from property_cached import cached_property import scipy.stats as stats from statsmodels.iolib.summary import SimpleTable, fmt_2cols, fmt_params from statsmodels.iolib.table import default_txt_fmt @@ -22,7 +22,7 @@ pval_format, quadratic_form_test) -def stub_concat(lists, sep='='): +def stub_concat(lists, sep="="): col_size = max([max(map(len, l)) for l in lists]) out = [] for l in lists: @@ -31,7 +31,7 @@ def stub_concat(lists, sep='='): return out[:-1] -def table_concat(lists, sep='='): +def table_concat(lists, sep="="): col_sizes = [] for l in lists: size = list(map(lambda r: list(map(len, r)), l)) @@ -58,27 +58,27 @@ class OLSResults(_SummaryStr): """ def __init__(self, results: Dict[str, Any], model): - self._resid = results['eps'] - self._wresid = results['weps'] - self._params = results['params'] - self._cov = results['cov'] + self._resid = results["eps"] + self._wresid = results["weps"] + self._params = results["params"] + self._cov = results["cov"] self.model = model - self._r2 = results['r2'] - self._cov_type = results['cov_type'] - self._rss = results['residual_ss'] - self._tss = results['total_ss'] - self._s2 = results['s2'] - self._debiased = results['debiased'] - self._f_statistic = results['fstat'] - self._vars = results['vars'] - self._cov_config = results['cov_config'] - self._method = results['method'] - self._kappa = results.get('kappa', None) + self._r2 = results["r2"] + self._cov_type = results["cov_type"] + self._rss = results["residual_ss"] + self._tss = results["total_ss"] + self._s2 = results["s2"] + self._debiased = 
results["debiased"] + self._f_statistic = results["fstat"] + self._vars = results["vars"] + self._cov_config = results["cov_config"] + self._method = results["method"] + self._kappa = results.get("kappa", None) self._datetime = dt.datetime.now() - self._cov_estimator = results['cov_estimator'] - self._original_index = results['original_index'] - self._fitted = results['fitted'] - self._df_model = results.get('df_model', self._params.shape[0]) + self._cov_estimator = results["cov_estimator"] + self._original_index = results["original_index"] + self._fitted = results["fitted"] + self._df_model = results.get("df_model", self._params.shape[0]) @property def cov_config(self) -> Dict[str, Any]: @@ -128,23 +128,33 @@ def idiosyncratic(self) -> Series: def _out_of_sample(self, exog, endog, data, fitted, missing): """Interface between model predict and predict for OOS fits""" if not (exog is None and endog is None) and data is not None: - raise ValueError('Predictions can only be constructed using one ' - 'of exog/endog or data, but not both.') + raise ValueError( + "Predictions can only be constructed using one " + "of exog/endog or data, but not both." + ) pred = self.model.predict(self.params, exog=exog, endog=endog, data=data) if not missing: pred = pred.loc[pred.notnull().all(1)] return pred - def predict(self, exog=None, endog=None, *, data=None, fitted=True, - idiosyncratic=False, missing=False): + def predict( + self, + exog=None, + endog=None, + *, + data=None, + fitted=True, + idiosyncratic=False, + missing=False + ): """ In- and out-of-sample predictions Parameters ---------- - exog : array-like + exog : array_like Exogenous values to use in out-of-sample prediction (nobs by nexog) - endog : array-like + endog : array_like Endogenous values to use in out-of-sample prediction (nobs by nendog) data : DataFrame, optional DataFrame to use for out-of-sample predictions when model was @@ -161,7 +171,7 @@ def predict(self, exog=None, endog=None, *, data=None, fitted=True, Returns ------- - predictions : DataFrame + DataFrame DataFrame containing columns for all selected outputs Notes @@ -184,7 +194,7 @@ def predict(self, exog=None, endog=None, *, data=None, fitted=True, if idiosyncratic: out.append(self.idiosyncratic) if len(out) == 0: - raise ValueError('At least one output must be selected') + raise ValueError("At least one output must be selected") out = concat(out, 1) # type: DataFrame if missing: index = self._original_index @@ -241,12 +251,12 @@ def cov_type(self) -> str: def std_errors(self) -> Series: """Estimated parameter standard errors""" std_errors = sqrt(diag(self.cov)) - return Series(std_errors, index=self._vars, name='stderr') + return Series(std_errors, index=self._vars, name="stderr") @cached_property def tstats(self) -> Series: """Parameter t-statistics""" - return Series(self._params / self.std_errors, name='tstat') + return Series(self._params / self.std_errors, name="tstat") @cached_property def pvalues(self) -> Series: @@ -258,7 +268,7 @@ def pvalues(self) -> Series: else: pvals = 2 - 2 * stats.norm.cdf(abs(self.tstats)) - return Series(pvals, index=self._vars, name='pvalue') + return Series(pvals, index=self._vars, name="pvalue") @property def total_ss(self) -> float: @@ -292,7 +302,7 @@ def f_statistic(self) -> WaldTestStatistic: Returns ------- - f : WaldTestStatistic + WaldTestStatistic Test statistic for null all coefficients excluding constant terms are zero. 
@@ -324,7 +334,7 @@ def conf_int(self, level=0.95) -> DataFrame: Returns ------- - ci : DataFrame + DataFrame Confidence interval of the form [lower, upper] for each parameters Notes @@ -338,20 +348,22 @@ def conf_int(self, level=0.95) -> DataFrame: q = stats.norm.ppf(ci_quantiles) q = q[None, :] ci = self.params[:, None] + self.std_errors[:, None] * q - return DataFrame(ci, index=self._vars, columns=['lower', 'upper']) + return DataFrame(ci, index=self._vars, columns=["lower", "upper"]) def _top_right(self): f_stat = _str(self.f_statistic.stat) if isnan(self.f_statistic.stat): - f_stat = ' N/A' - - return [('R-squared:', _str(self.rsquared)), - ('Adj. R-squared:', _str(self.rsquared_adj)), - ('F-statistic:', f_stat), - ('P-value (F-stat)', pval_format(self.f_statistic.pval)), - ('Distribution:', str(self.f_statistic.dist_name)), - ('', ''), - ('', '')] + f_stat = " N/A" + + return [ + ("R-squared:", _str(self.rsquared)), + ("Adj. R-squared:", _str(self.rsquared_adj)), + ("F-statistic:", f_stat), + ("P-value (F-stat)", pval_format(self.f_statistic.pval)), + ("Distribution:", str(self.f_statistic.dist_name)), + ("", ""), + ("", ""), + ] @property def summary(self) -> Summary: @@ -361,15 +373,17 @@ def summary(self) -> Summary: ``summary.as_html()`` and ``summary.as_latex()``. """ - title = self._method + ' Estimation Summary' + title = self._method + " Estimation Summary" mod = self.model - top_left = [('Dep. Variable:', mod.dependent.cols[0]), - ('Estimator:', self._method), - ('No. Observations:', self.nobs), - ('Date:', self._datetime.strftime('%a, %b %d %Y')), - ('Time:', self._datetime.strftime('%H:%M:%S')), - ('Cov. Estimator:', self._cov_type), - ('', '')] + top_left = [ + ("Dep. Variable:", mod.dependent.cols[0]), + ("Estimator:", self._method), + ("No. Observations:", self.nobs), + ("Date:", self._datetime.strftime("%a, %b %d %Y")), + ("Time:", self._datetime.strftime("%H:%M:%S")), + ("Cov. Estimator:", self._cov_type), + ("", ""), + ] top_right = self._top_right() @@ -385,9 +399,9 @@ def summary(self) -> Summary: # Top Table # Parameter table fmt = fmt_2cols - fmt['data_fmts'][1] = '%18s' + fmt["data_fmts"][1] = "%18s" - top_right = [('%-21s' % (' ' + k), v) for k, v in top_right] + top_right = [("%-21s" % (" " + k), v) for k, v in top_right] stubs = [] vals = [] for stub, val in top_right: @@ -396,11 +410,13 @@ def summary(self) -> Summary: table.extend_right(SimpleTable(vals, stubs=stubs)) smry.tables.append(table) - param_data = c_[self.params.values[:, None], - self.std_errors.values[:, None], - self.tstats.values[:, None], - self.pvalues.values[:, None], - self.conf_int()] + param_data = c_[ + self.params.values[:, None], + self.std_errors.values[:, None], + self.tstats.values[:, None], + self.pvalues.values[:, None], + self.conf_int(), + ] data = [] for row in param_data: txt_row = [] @@ -410,35 +426,42 @@ def summary(self) -> Summary: f = pval_format txt_row.append(f(v)) data.append(txt_row) - title = 'Parameter Estimates' + title = "Parameter Estimates" table_stubs = list(self.params.index) extra_text = [] if table_stubs: - header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value', 'Lower CI', 'Upper CI'] - table = SimpleTable(data, - stubs=table_stubs, - txt_fmt=fmt_params, - headers=header, - title=title) + header = [ + "Parameter", + "Std. 
Err.", + "T-stat", + "P-value", + "Lower CI", + "Upper CI", + ] + table = SimpleTable( + data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title + ) smry.tables.append(table) else: - extra_text.append('Model contains no parameters') + extra_text.append("Model contains no parameters") instruments = self.model.instruments if instruments.shape[1] > 0: endog = self.model.endog - extra_text.append('Endogenous: ' + ', '.join(endog.cols)) - extra_text.append('Instruments: ' + ', '.join(instruments.cols)) + extra_text.append("Endogenous: " + ", ".join(endog.cols)) + extra_text.append("Instruments: " + ", ".join(instruments.cols)) cov_descr = str(self._cov_estimator) - for line in cov_descr.split('\n'): + for line in cov_descr.split("\n"): extra_text.append(line) if extra_text: smry.add_extra_txt(extra_text) return smry - def wald_test(self, restriction=None, value=None, *, formula=None) -> WaldTestStatistic: + def wald_test( + self, restriction=None, value=None, *, formula=None + ) -> WaldTestStatistic: r""" Test linear equality constraints using a Wald test @@ -463,7 +486,7 @@ def wald_test(self, restriction=None, value=None, *, formula=None) -> WaldTestSt Returns ------- - t: WaldTestStatistic + WaldTestStatistic Test statistic for null that restrictions are valid. Notes @@ -497,29 +520,35 @@ def wald_test(self, restriction=None, value=None, *, formula=None) -> WaldTestSt >>> res.wald_test(formula=['exper = 0', 'I(exper ** 2) = 0']) """ - return quadratic_form_test(self._params, self.cov, restriction=restriction, - value=value, formula=formula) + return quadratic_form_test( + self._params, + self.cov, + restriction=restriction, + value=value, + formula=formula, + ) class AbsorbingLSResults(OLSResults): def __init__(self, results: Dict[str, Any], model): super(AbsorbingLSResults, self).__init__(results, model) - self._absorbed_rsquared = results['absorbed_r2'] - self._absorbed_effects = results['absorbed_effects'] + self._absorbed_rsquared = results["absorbed_r2"] + self._absorbed_effects = results["absorbed_effects"] def _top_right(self): f_stat = _str(self.f_statistic.stat) if isnan(self.f_statistic.stat): - f_stat = ' N/A' - - return [('R-squared:', _str(self.rsquared)), - ('Adj. R-squared:', _str(self.rsquared_adj)), - ('F-statistic:', f_stat), - ('P-value (F-stat):', pval_format(self.f_statistic.pval)), - ('Distribution:', str(self.f_statistic.dist_name)), - ('R-squared (No Effects):', _str(round(self.absorbed_rsquared, 5))), - ('Varaibles Absorbed:', _str(self.df_absorbed)) - ] + f_stat = " N/A" + + return [ + ("R-squared:", _str(self.rsquared)), + ("Adj. R-squared:", _str(self.rsquared_adj)), + ("F-statistic:", f_stat), + ("P-value (F-stat):", pval_format(self.f_statistic.pval)), + ("Distribution:", str(self.f_statistic.dist_name)), + ("R-squared (No Effects):", _str(round(self.absorbed_rsquared, 5))), + ("Varaibles Absorbed:", _str(self.df_absorbed)), + ] @property def absorbed_rsquared(self) -> float: @@ -561,7 +590,7 @@ def diagnostics(self) -> DataFrame: Returns ------- - res : DataFrame + DataFrame DataFrame where each endogenous variable appears as a row and the columns contain alternative measures. The columns are: @@ -584,6 +613,7 @@ def diagnostics(self) -> DataFrame: is with respect to the other included variables in the model. 
""" from linearmodels.iv.model import _OLS, IV2SLS + endog, exog, instr, weights = self.endog, self.exog, self.instr, self.weights w = sqrt(weights.ndarray) z = w * instr.ndarray @@ -606,25 +636,36 @@ def diagnostics(self) -> DataFrame: params = params[:, None] stat = params.T @ inv(res.cov) @ params stat = float(stat.squeeze()) - w_test = WaldTestStatistic(stat, null='', df=params.shape[0]) - inner = {'rsquared': individual_results[col].rsquared, - 'partial.rsquared': res.rsquared, - 'f.stat': w_test.stat, - 'f.pval': w_test.pval, - 'f.dist': w_test.dist_name} + w_test = WaldTestStatistic(stat, null="", df=params.shape[0]) + inner = { + "rsquared": individual_results[col].rsquared, + "partial.rsquared": res.rsquared, + "f.stat": w_test.stat, + "f.pval": w_test.pval, + "f.dist": w_test.dist_name, + } out[col] = Series(inner) out_df = DataFrame(out).T dep = self.dep - r2sls = IV2SLS(dep, exog, endog, instr, weights=weights).fit(cov_type='unadjusted') - rols = _OLS(dep, self._reg, weights=weights).fit(cov_type='unadjusted') + r2sls = IV2SLS(dep, exog, endog, instr, weights=weights).fit( + cov_type="unadjusted" + ) + rols = _OLS(dep, self._reg, weights=weights).fit(cov_type="unadjusted") shea = (rols.std_errors / r2sls.std_errors) ** 2 shea *= (1 - r2sls.rsquared) / (1 - rols.rsquared) - out_df['shea.rsquared'] = shea[out_df.index] - cols = ['rsquared', 'partial.rsquared', 'shea.rsquared', 'f.stat', 'f.pval', 'f.dist'] + out_df["shea.rsquared"] = shea[out_df.index] + cols = [ + "rsquared", + "partial.rsquared", + "shea.rsquared", + "f.stat", + "f.pval", + "f.dist", + ] out_df = out_df[cols] for c in out_df: - out_df[c] = to_numeric(out_df[c], errors='ignore') + out_df[c] = to_numeric(out_df[c], errors="ignore") return out_df @@ -635,11 +676,12 @@ def individual(self) -> Dict[str, OLSResults]: Returns ------- - res : dict + dict Dictionary containing first stage estimation results. Keys are the variable names of the endogenous regressors. """ from linearmodels.iv.model import _OLS + w = sqrt(self.weights.ndarray) exog_instr = w * c_[self.exog.ndarray, self.instr.ndarray] exog_instr = DataFrame(exog_instr, columns=self.exog.cols + self.instr.cols) @@ -658,17 +700,19 @@ def summary(self) -> Summary: Supports export to csv, html and latex using the methods ``summary.as_csv()``, ``summary.as_html()`` and ``summary.as_latex()``. 
""" - stubs_lookup = {'rsquared': 'R-squared', - 'partial.rsquared': 'Partial R-squared', - 'shea.rsquared': 'Shea\'s R-squared', - 'f.stat': 'Partial F-statistic', - 'f.pval': 'P-value (Partial F-stat)', - 'f.dist': 'Partial F-stat Distn'} + stubs_lookup = { + "rsquared": "R-squared", + "partial.rsquared": "Partial R-squared", + "shea.rsquared": "Shea's R-squared", + "f.stat": "Partial F-statistic", + "f.pval": "P-value (Partial F-stat)", + "f.dist": "Partial F-stat Distn", + } smry = Summary() diagnostics = self.diagnostics vals = [] for c in diagnostics: - if c != 'f.dist': + if c != "f.dist": vals.append([_str(v) for v in diagnostics[c]]) else: vals.append([v for v in diagnostics[c]]) @@ -684,24 +728,28 @@ def summary(self) -> Summary: params_fmt = [[_str(val) for val in row] for row in params_arr.T] for i in range(1, len(params_fmt), 2): for j in range(len(params_fmt[i])): - params_fmt[i][j] = '({0})'.format(params_fmt[i][j]) + params_fmt[i][j] = "({0})".format(params_fmt[i][j]) params_stub = [] for var in res.params.index: - params_stub.extend([var, '']) + params_stub.extend([var, ""]) - title = 'First Stage Estimation Results' + title = "First Stage Estimation Results" vals = table_concat((vals, params_fmt)) stubs = stub_concat((stubs, params_stub)) txt_fmt = default_txt_fmt.copy() - txt_fmt['data_aligns'] = 'r' - txt_fmt['header_align'] = 'r' - table = SimpleTable(vals, headers=header, title=title, stubs=stubs, txt_fmt=txt_fmt) + txt_fmt["data_aligns"] = "r" + txt_fmt["header_align"] = "r" + table = SimpleTable( + vals, headers=header, title=title, stubs=stubs, txt_fmt=txt_fmt + ) smry.tables.append(table) - extra_txt = ['T-stats reported in parentheses', - 'T-stats use same covariance type as original model'] + extra_txt = [ + "T-stats reported in parentheses", + "T-stats use same covariance type as original model", + ] smry.add_extra_txt(extra_txt) return smry @@ -713,7 +761,7 @@ class _CommonIVResults(OLSResults): def __init__(self, results: Dict[str, Any], model): super(_CommonIVResults, self).__init__(results, model) - self._liml_kappa = results.get('liml_kappa', None) + self._liml_kappa = results.get("liml_kappa", None) @property def first_stage(self) -> FirstStageResults: @@ -722,13 +770,18 @@ def first_stage(self) -> FirstStageResults: Returns ------- - first : FirstStageResults + FirstStageResults Object containing results for diagnosing instrument relevance issues. 
""" - return FirstStageResults(self.model.dependent, self.model.exog, - self.model.endog, self.model.instruments, - self.model.weights, self._cov_type, - self._cov_config) + return FirstStageResults( + self.model.dependent, + self.model.exog, + self.model.endog, + self.model.instruments, + self.model.weights, + self._cov_type, + self._cov_config, + ) class IVResults(_CommonIVResults): @@ -745,7 +798,7 @@ class IVResults(_CommonIVResults): def __init__(self, results: Dict[str, Any], model): super(IVResults, self).__init__(results, model) - self._kappa = results.get('kappa', 1) + self._kappa = results.get("kappa", 1) @cached_property def sargan(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: @@ -754,7 +807,7 @@ def sargan(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -781,15 +834,17 @@ def sargan(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: z = self.model.instruments.ndarray nobs, ninstr = z.shape nendog = self.model.endog.shape[1] - name = 'Sargan\'s test of overidentification' + name = "Sargan's test of overidentification" if ninstr - nendog == 0: - return InvalidTestStatistic('Test requires more instruments than ' - 'endogenous variables.', name=name) + return InvalidTestStatistic( + "Test requires more instruments than " "endogenous variables.", + name=name, + ) eps = self.resids.values[:, None] u = annihilate(eps, self.model._z) stat = nobs * (1 - (u.T @ u) / (eps.T @ eps)).squeeze() - null = 'The model is not overidentified.' + null = "The model is not overidentified." return WaldTestStatistic(stat, null, ninstr - nendog, name=name) @@ -800,7 +855,7 @@ def basmann(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -828,10 +883,12 @@ def basmann(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: ninstr = mod.instruments.shape[1] nobs, nendog = mod.endog.shape nz = mod._z.shape[1] - name = 'Basmann\'s test of overidentification' + name = "Basmann's test of overidentification" if ninstr - nendog == 0: - return InvalidTestStatistic('Test requires more instruments than ' - 'endogenous variables.', name=name) + return InvalidTestStatistic( + "Test requires more instruments than " "endogenous variables.", + name=name, + ) sargan_test = self.sargan s = sargan_test.stat stat = s * (nobs - nz) / (nobs - s) @@ -856,8 +913,10 @@ def _endogeneity_setup(self, var_names=None): ntested = assumed_exog.shape[1] from linearmodels.iv import IV2SLS - mod = IV2SLS(self.model.dependent, aug_exog, still_endog, - self.model.instruments) + + mod = IV2SLS( + self.model.dependent, aug_exog, still_endog, self.model.instruments + ) e0 = mod.fit().resids.values[:, None] z2 = c_[self.model.exog.ndarray, self.model.instruments.ndarray] @@ -879,7 +938,7 @@ def durbin(self, variables=None) -> WaldTestStatistic: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -910,15 +969,15 @@ def durbin(self, variables=None) -> WaldTestStatistic: where :math:`q` is the number of variables tested. 
""" - null = 'All endogenous variables are exogenous' + null = "All endogenous variables are exogenous" if variables is not None: - null = 'Variables {0} are exogenous'.format(', '.join(variables)) + null = "Variables {0} are exogenous".format(", ".join(variables)) e0, e1, e2, nobs, _, _, ntested = self._endogeneity_setup(variables) stat = e1.T @ e1 - e2.T @ e2 stat /= (e0.T @ e0) / nobs - name = 'Durbin test of exogeneity' + name = "Durbin test of exogeneity" df = ntested return WaldTestStatistic(float(stat), null, df, name=name) @@ -934,7 +993,7 @@ def wu_hausman(self, variables=None) -> WaldTestStatistic: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -967,20 +1026,20 @@ def wu_hausman(self, variables=None) -> WaldTestStatistic: :math:`v = n - n_{endog} - n_{exog} - q`. The test statistic has a :math:`F_{q, v}` distribution. """ - null = 'All endogenous variables are exogenous' + null = "All endogenous variables are exogenous" if variables is not None: - null = 'Variables {0} are exogenous'.format(', '.join(variables)) + null = "Variables {0} are exogenous".format(", ".join(variables)) e0, e1, e2, nobs, nexog, nendog, ntested = self._endogeneity_setup(variables) df = ntested df_denom = nobs - nexog - nendog - ntested - delta = (e1.T @ e1 - e2.T @ e2) + delta = e1.T @ e1 - e2.T @ e2 stat = delta / df stat /= (e0.T @ e0 - delta) / df_denom stat = float(stat) - name = 'Wu-Hausman test of exogeneity' + name = "Wu-Hausman test of exogeneity" return WaldTestStatistic(stat, null, df, df_denom, name=name) @cached_property @@ -990,7 +1049,7 @@ def wooldridge_score(self) -> WaldTestStatistic: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1020,11 +1079,11 @@ def wooldridge_score(self) -> WaldTestStatistic: r = annihilate(self.model.endog.ndarray, self.model._z) nobs = e.shape[0] r = annihilate(r, self.model._x) - res = _OLS(ones((nobs, 1)), r * e).fit(cov_type='unadjusted') + res = _OLS(ones((nobs, 1)), r * e).fit(cov_type="unadjusted") stat = res.nobs - res.resid_ss df = self.model.endog.shape[1] - null = 'Endogenous variables are exogenous' - name = 'Wooldridge\'s score test of exogeneity' + null = "Endogenous variables are exogenous" + name = "Wooldridge's score test of exogeneity" return WaldTestStatistic(stat, null, df, name=name) @cached_property @@ -1034,7 +1093,7 @@ def wooldridge_regression(self) -> WaldTestStatistic: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1057,6 +1116,7 @@ def wooldridge_regression(self) -> WaldTestStatistic: identical to the covariance estimator used with ``fit``. 
""" from linearmodels.iv.model import _OLS + r = annihilate(self.model.endog.ndarray, self.model._z) augx = c_[self.model._x, r] mod = _OLS(self.model.dependent, augx) @@ -1066,8 +1126,8 @@ def wooldridge_regression(self) -> WaldTestStatistic: test_cov = res.cov.values[norig:, norig:] stat = test_params.T @ inv(test_cov) @ test_params df = len(test_params) - null = 'Endogenous variables are exogenous' - name = 'Wooldridge\'s regression test of exogeneity' + null = "Endogenous variables are exogenous" + name = "Wooldridge's regression test of exogeneity" return WaldTestStatistic(stat, null, df, name=name) @cached_property @@ -1077,7 +1137,7 @@ def wooldridge_overid(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1100,24 +1160,27 @@ def wooldridge_overid(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: The order of the instruments does not affect this test. """ from linearmodels.iv.model import _OLS + exog, endog = self.model.exog, self.model.endog instruments = self.model.instruments nobs, nendog = endog.shape ninstr = instruments.shape[1] - name = 'Wooldridge\'s score test of overidentification' + name = "Wooldridge's score test of overidentification" if ninstr - nendog == 0: - return InvalidTestStatistic('Test requires more instruments than ' - 'endogenous variables.', name=name) + return InvalidTestStatistic( + "Test requires more instruments than " "endogenous variables.", + name=name, + ) endog_hat = proj(endog.ndarray, c_[exog.ndarray, instruments.ndarray]) - q = instruments.ndarray[:, :(ninstr - nendog)] + q = instruments.ndarray[:, : (ninstr - nendog)] q_res = annihilate(q, c_[self.model.exog.ndarray, endog_hat]) test_functions = q_res * self.resids.values[:, None] - res = _OLS(ones((nobs, 1)), test_functions).fit(cov_type='unadjusted') + res = _OLS(ones((nobs, 1)), test_functions).fit(cov_type="unadjusted") stat = res.nobs * res.rsquared df = ninstr - nendog - null = 'Model is not overidentified.' + null = "Model is not overidentified." return WaldTestStatistic(stat, null, df, name=name) @cached_property @@ -1127,7 +1190,7 @@ def anderson_rubin(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1144,13 +1207,15 @@ def anderson_rubin(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: """ nobs, ninstr = self.model.instruments.shape nendog = self.model.endog.shape[1] - name = 'Anderson-Rubin test of overidentification' + name = "Anderson-Rubin test of overidentification" if ninstr - nendog == 0: - return InvalidTestStatistic('Test requires more instruments than ' - 'endogenous variables.', name=name) + return InvalidTestStatistic( + "Test requires more instruments than " "endogenous variables.", + name=name, + ) stat = nobs * log(self._liml_kappa) df = ninstr - nendog - null = 'The model is not overidentified.' + null = "The model is not overidentified." 
return WaldTestStatistic(stat, null, df, name=name) @cached_property @@ -1160,7 +1225,7 @@ def basmann_f(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1177,14 +1242,16 @@ def basmann_f(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: """ nobs, ninstr = self.model.instruments.shape nendog, nexog = self.model.endog.shape[1], self.model.exog.shape[1] - name = 'Basmann\' F test of overidentification' + name = "Basmann' F test of overidentification" if ninstr - nendog == 0: - return InvalidTestStatistic('Test requires more instruments than ' - 'endogenous variables.', name=name) + return InvalidTestStatistic( + "Test requires more instruments than " "endogenous variables.", + name=name, + ) df = ninstr - nendog df_denom = nobs - (nexog + ninstr) stat = (self._liml_kappa - 1) * df_denom / df - null = 'The model is not overidentified.' + null = "The model is not overidentified." return WaldTestStatistic(stat, null, df, df_denom=df_denom, name=name) @@ -1202,11 +1269,11 @@ class IVGMMResults(_CommonIVResults): def __init__(self, results, model): super(IVGMMResults, self).__init__(results, model) - self._weight_mat = results['weight_mat'] - self._weight_type = results['weight_type'] - self._weight_config = results['weight_config'] - self._iterations = results['iterations'] - self._j_stat = results['j_stat'] + self._weight_mat = results["weight_mat"] + self._weight_type = results["weight_type"] + self._weight_config = results["weight_config"] + self._iterations = results["iterations"] + self._j_stat = results["j_stat"] @property def weight_matrix(self) -> ndarray: @@ -1235,7 +1302,7 @@ def j_stat(self) -> Union[InvalidTestStatistic, WaldTestStatistic]: Returns ------- - j : WaldTestStatistic + WaldTestStatistic J statistic test of overidentifying restrictions Notes @@ -1268,7 +1335,7 @@ def c_stat(self, variables=None) -> WaldTestStatistic: Returns ------- - t : WaldTestStatistic + WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes @@ -1308,15 +1375,16 @@ def c_stat(self, variables=None) -> WaldTestStatistic: exog_e = c_[exog.ndarray, endog.ndarray] nobs = exog_e.shape[0] endog_e = empty((nobs, 0)) - null = 'All endogenous variables are exogenous' + null = "All endogenous variables are exogenous" else: if not isinstance(variables, list): variables = [variables] exog_e = c_[exog.ndarray, endog.pandas[variables].values] ex = [c for c in endog.pandas if c not in variables] endog_e = endog.pandas[ex].values - null = 'Variables {0} are exogenous'.format(', '.join(variables)) + null = "Variables {0} are exogenous".format(", ".join(variables)) from linearmodels.iv import IVGMM + mod = IVGMM(dependent, exog_e, endog_e, instruments) res_e = mod.fit(cov_type=self.cov_type, **self.cov_config) j_e = res_e.j_stat.stat @@ -1331,7 +1399,7 @@ def c_stat(self, variables=None) -> WaldTestStatistic: stat = j_e - j_c df = exog_e.shape[1] - exog.shape[1] - return WaldTestStatistic(stat, null, df, name='C-statistic') + return WaldTestStatistic(stat, null, df, name="C-statistic") AnyResult = Union[IVResults, IVGMMResults, OLSResults] @@ -1350,26 +1418,31 @@ class IVModelComparison(_ModelComparison): Estimator precision estimator to include in the comparison output. Default is 'tstats'. 
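For the GMM statistics and the comparison class introduced above, a sketch; the dict keys are illustrative column labels, and `compare` is assumed importable from this results module, where it is defined below:

from linearmodels.iv import IVGMM
from linearmodels.iv.results import compare

gmm = IVGMM(data.y, data[["const"]], data.x, data[["z1", "z2"]]).fit()
print(gmm.j_stat)     # J test of the overidentifying restrictions
print(gmm.c_stat())   # difference-in-J test that all endogenous variables are exogenous

print(compare({"2SLS": res, "LIML": liml, "GMM": gmm}, precision="tstats").summary)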
""" + _supported = (IVResults, IVGMMResults, OLSResults) - def __init__(self, results: Union[List[AnyResult], Dict[str, AnyResult]], *, - precision: str = 'tstats'): + def __init__( + self, + results: Union[List[AnyResult], Dict[str, AnyResult]], + *, + precision: str = "tstats" + ): super(IVModelComparison, self).__init__(results, precision=precision) @property def rsquared_adj(self) -> float: """Sample-size adjusted coefficients of determination (R**2)""" - return self._get_property('rsquared_adj') + return self._get_property("rsquared_adj") @property def estimator_method(self) -> str: """Estimation methods""" - return self._get_property('_method') + return self._get_property("_method") @property def cov_estimator(self) -> str: """Covariance estimator descriptions""" - return self._get_property('cov_estimator') + return self._get_property("cov_estimator") @property def summary(self) -> Summary: @@ -1380,16 +1453,36 @@ def summary(self) -> Summary: """ smry = Summary() models = list(self._results.keys()) - title = 'Model Comparison' - stubs = ['Dep. Variable', 'Estimator', 'No. Observations', 'Cov. Est.', 'R-squared', - 'Adj. R-squared', 'F-statistic', 'P-value (F-stat)'] - dep_name = OrderedDict() # type: Dict[str, Union[IVResults, IVGMMResults, OLSResults]] + title = "Model Comparison" + stubs = [ + "Dep. Variable", + "Estimator", + "No. Observations", + "Cov. Est.", + "R-squared", + "Adj. R-squared", + "F-statistic", + "P-value (F-stat)", + ] + dep_name = ( + OrderedDict() + ) # type: Dict[str, Union[IVResults, IVGMMResults, OLSResults]] for key in self._results: dep_name[key] = self._results[key].model.dependent.cols[0] dep_name = Series(dep_name) - vals = concat([dep_name, self.estimator_method, self.nobs, self.cov_estimator, - self.rsquared, self.rsquared_adj, self.f_statistic], 1) + vals = concat( + [ + dep_name, + self.estimator_method, + self.nobs, + self.cov_estimator, + self.rsquared, + self.rsquared_adj, + self.f_statistic, + ], + 1, + ) vals = [[i for i in v] for v in vals.T.values] vals[2] = [str(v) for v in vals[2]] for i in range(4, len(vals)): @@ -1404,11 +1497,11 @@ def summary(self) -> Summary: precision_fmt = [] for v in precision.values[i]: v_str = _str(v) - v_str = '({0})'.format(v_str) if v_str.strip() else v_str + v_str = "({0})".format(v_str) if v_str.strip() else v_str precision_fmt.append(v_str) params_fmt.append(precision_fmt) params_stub.append(params.index[i]) - params_stub.append(' ') + params_stub.append(" ") vals = table_concat((vals, params_fmt)) stubs = stub_concat((stubs, params_stub)) @@ -1419,33 +1512,35 @@ def summary(self) -> Summary: all_instr.append(res.model.instruments.cols) ninstr = max(map(len, all_instr)) instruments = [] - instrument_stub = ['Instruments'] + instrument_stub = ["Instruments"] for i in range(ninstr): if i > 0: - instrument_stub.append('') + instrument_stub.append("") row = [] for j in range(len(self._results)): instr = all_instr[j] if len(instr) > i: row.append(instr[i]) else: - row.append('') + row.append("") instruments.append(row) if instruments: vals = table_concat((vals, instruments)) stubs = stub_concat((stubs, instrument_stub)) txt_fmt = default_txt_fmt.copy() - txt_fmt['data_aligns'] = 'r' - txt_fmt['header_align'] = 'r' - table = SimpleTable(vals, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt) + txt_fmt["data_aligns"] = "r" + txt_fmt["header_align"] = "r" + table = SimpleTable( + vals, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt + ) smry.tables.append(table) prec_type = 
self._PRECISION_TYPES[self._precision] - smry.add_extra_txt(['{0} reported in parentheses'.format(prec_type)]) + smry.add_extra_txt(["{0} reported in parentheses".format(prec_type)]) return smry -def compare(results, *, precision='tstats') -> IVModelComparison: +def compare(results, *, precision="tstats") -> IVModelComparison: """ Compare the results of multiple models @@ -1460,6 +1555,7 @@ def compare(results, *, precision='tstats') -> IVModelComparison: Returns ------- - comparison : IVModelComparison + IVModelComparison + The model comparison object. """ return IVModelComparison(results, precision=precision) diff --git a/linearmodels/panel/__init__.py b/linearmodels/panel/__init__.py index 9d8febe1d3..60536bba79 100644 --- a/linearmodels/panel/__init__.py +++ b/linearmodels/panel/__init__.py @@ -3,5 +3,12 @@ RandomEffects) from linearmodels.panel.results import compare -__all__ = ['PanelOLS', 'PooledOLS', 'RandomEffects', 'FirstDifferenceOLS', 'BetweenOLS', - 'FamaMacBeth', 'compare'] +__all__ = [ + "PanelOLS", + "PooledOLS", + "RandomEffects", + "FirstDifferenceOLS", + "BetweenOLS", + "FamaMacBeth", + "compare", +] diff --git a/linearmodels/panel/covariance.py b/linearmodels/panel/covariance.py index 570b5eb5f1..3e06b7ef6b 100644 --- a/linearmodels/panel/covariance.py +++ b/linearmodels/panel/covariance.py @@ -1,14 +1,19 @@ -from property_cached import cached_property import numpy as np from numpy.linalg import inv from pandas import DataFrame +from property_cached import cached_property from linearmodels.iv.covariance import (CLUSTER_ERR, KERNEL_LOOKUP, _cov_cluster, _cov_kernel, kernel_optimal_bandwidth) -__all__ = ['HomoskedasticCovariance', 'HeteroskedasticCovariance', - 'ClusteredCovariance', 'DriscollKraay', 'CovarianceManager'] +__all__ = [ + "HomoskedasticCovariance", + "HeteroskedasticCovariance", + "ClusteredCovariance", + "DriscollKraay", + "CovarianceManager", +] class HomoskedasticCovariance(object): @@ -56,7 +61,9 @@ class HomoskedasticCovariance(object): ``True``. """ - def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0): + def __init__( + self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0 + ): self._y = y self._x = x self._params = params @@ -69,7 +76,7 @@ def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_ if debiased: self._nobs_eff -= self._nvar self._scale = self._nobs / self._nobs_eff - self._name = 'Unadjusted' + self._name = "Unadjusted" @property def name(self): @@ -146,10 +153,13 @@ class HeteroskedasticCovariance(HomoskedasticCovariance): ``True``. """ - def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0): - super(HeteroskedasticCovariance, self).__init__(y, x, params, entity_ids, time_ids, - debiased=debiased, extra_df=extra_df) - self._name = 'Robust' + def __init__( + self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0 + ): + super(HeteroskedasticCovariance, self).__init__( + y, x, params, entity_ids, time_ids, debiased=debiased, extra_df=extra_df + ) + self._name = "Robust" @cached_property def cov(self): @@ -196,7 +206,7 @@ class ClusteredCovariance(HomoskedasticCovariance): Returns ------- - cov : array + ndarray Estimated parameter covariance Notes @@ -231,23 +241,34 @@ class ClusteredCovariance(HomoskedasticCovariance): observations. 
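The covariance estimators in this module are not normally constructed directly; they are selected through a panel model's fit. A sketch with simulated panel data (entity and time dimensions are illustrative):

import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS

rs = np.random.RandomState(0)
index = pd.MultiIndex.from_product([range(50), range(8)], names=["entity", "time"])
panel_df = pd.DataFrame({"x": rs.standard_normal(400)}, index=index)
panel_df["y"] = 0.5 * panel_df.x + rs.standard_normal(400)

mod = PanelOLS(panel_df.y, panel_df[["x"]], entity_effects=True)
by_entity = mod.fit(cov_type="clustered", cluster_entity=True)   # one-way
two_way = mod.fit(cov_type="clustered", cluster_entity=True, cluster_time=True)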
""" - def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0, - clusters=None, - group_debias=False): - super(ClusteredCovariance, self).__init__(y, x, params, entity_ids, time_ids, - debiased=debiased, extra_df=extra_df) + def __init__( + self, + y, + x, + params, + entity_ids, + time_ids, + *, + debiased=False, + extra_df=0, + clusters=None, + group_debias=False + ): + super(ClusteredCovariance, self).__init__( + y, x, params, entity_ids, time_ids, debiased=debiased, extra_df=extra_df + ) if clusters is None: clusters = np.arange(self._x.shape[0]) clusters = np.asarray(clusters).squeeze() self._group_debias = group_debias dim1 = 1 if clusters.ndim == 1 else clusters.shape[1] if clusters.ndim > 2 or dim1 > 2: - raise ValueError('Only 1 or 2-way clustering supported.') + raise ValueError("Only 1 or 2-way clustering supported.") nobs = y.shape[0] if clusters.shape[0] != nobs: raise ValueError(CLUSTER_ERR.format(nobs, clusters.shape[0])) self._clusters = clusters - self._name = 'Clustered' + self._name = "Clustered" @staticmethod def _calc_group_debias(clusters): @@ -362,11 +383,23 @@ class DriscollKraay(HomoskedasticCovariance): # TODO: Test - def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0, - kernel='newey-west', bandwidth=None): - super(DriscollKraay, self).__init__(y, x, params, entity_ids, time_ids, - debiased=debiased, extra_df=extra_df) - self._name = 'Driscoll-Kraay' + def __init__( + self, + y, + x, + params, + entity_ids, + time_ids, + *, + debiased=False, + extra_df=0, + kernel="newey-west", + bandwidth=None + ): + super(DriscollKraay, self).__init__( + y, x, params, entity_ids, time_ids, debiased=debiased, extra_df=extra_df + ) + self._name = "Driscoll-Kraay" self._kernel = kernel self._bandwidth = bandwidth @@ -464,11 +497,23 @@ class ACCovariance(HomoskedasticCovariance): # TODO: Docstring - def __init__(self, y, x, params, entity_ids, time_ids, *, debiased=False, extra_df=0, - kernel='newey-west', bandwidth=None): - super(ACCovariance, self).__init__(y, x, params, entity_ids, time_ids, - debiased=debiased, extra_df=extra_df) - self._name = 'Autocorrelation Rob. Cov.' + def __init__( + self, + y, + x, + params, + entity_ids, + time_ids, + *, + debiased=False, + extra_df=0, + kernel="newey-west", + bandwidth=None + ): + super(ACCovariance, self).__init__( + y, x, params, entity_ids, time_ids, debiased=debiased, extra_df=extra_df + ) + self._name = "Autocorrelation Rob. Cov." 
self._kernel = kernel self._bandwidth = bandwidth @@ -511,17 +556,19 @@ def cov(self): class CovarianceManager(object): - COVARIANCE_ESTIMATORS = {'unadjusted': HomoskedasticCovariance, - 'conventional': HomoskedasticCovariance, - 'homoskedastic': HomoskedasticCovariance, - 'robust': HeteroskedasticCovariance, - 'heteroskedastic': HeteroskedasticCovariance, - 'clustered': ClusteredCovariance, - 'driscoll-kraay': DriscollKraay, - 'dk': DriscollKraay, - 'kernel': DriscollKraay, - 'ac': ACCovariance, - 'autocorrelated': ACCovariance} + COVARIANCE_ESTIMATORS = { + "unadjusted": HomoskedasticCovariance, + "conventional": HomoskedasticCovariance, + "homoskedastic": HomoskedasticCovariance, + "robust": HeteroskedasticCovariance, + "heteroskedastic": HeteroskedasticCovariance, + "clustered": ClusteredCovariance, + "driscoll-kraay": DriscollKraay, + "dk": DriscollKraay, + "kernel": DriscollKraay, + "ac": ACCovariance, + "autocorrelated": ACCovariance, + } def __init__(self, estimator, *cov_estimators): self._estimator = estimator @@ -529,11 +576,13 @@ def __init__(self, estimator, *cov_estimators): def __getitem__(self, item): if item not in self.COVARIANCE_ESTIMATORS: - raise KeyError('Unknown covariance estimator type.') + raise KeyError("Unknown covariance estimator type.") cov_est = self.COVARIANCE_ESTIMATORS[item] if cov_est not in self._supported: - raise ValueError('Requested covariance estimator is not supported ' - 'for the {0}.'.format(self._estimator)) + raise ValueError( + "Requested covariance estimator is not supported " + "for the {0}.".format(self._estimator) + ) return cov_est @@ -561,9 +610,11 @@ class FamaMacBethCovariance(HomoskedasticCovariance): """ def __init__(self, y, x, params, all_params, *, debiased=False): - super(FamaMacBethCovariance, self).__init__(y, x, params, None, None, debiased=debiased) + super(FamaMacBethCovariance, self).__init__( + y, x, params, None, None, debiased=debiased + ) self._all_params = all_params - self._name = 'Fama-MacBeth Std Cov' + self._name = "Fama-MacBeth Std Cov" @cached_property def cov(self): @@ -571,7 +622,7 @@ def cov(self): e = self._all_params - self._params.T e = e[np.all(np.isfinite(e), 1)] nobs = e.shape[0] - cov = (e.T @ e / nobs) + cov = e.T @ e / nobs return cov / (nobs - int(bool(self._debiased))) @@ -602,11 +653,21 @@ class FamaMacBethKernelCovariance(FamaMacBethCovariance): Covariance is a Kernel covariance of all estimated parameters. 
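The Fama-MacBeth covariances above average deviations of the per-period parameter estimates (e.T @ e / nobs over the rows of all_params with finite values). A sketch of the estimator they back, assuming cov_type="kernel" selects the kernel variant:

from linearmodels.panel import FamaMacBeth

fm = FamaMacBeth(panel_df.y, panel_df[["x"]]).fit()
fm_hac = FamaMacBeth(panel_df.y, panel_df[["x"]]).fit(cov_type="kernel")
print(fm.std_errors)
print(fm_hac.std_errors)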
""" - def __init__(self, y, x, params, all_params, *, debiased=False, kernel='newey-west', - bandwidth=None): - super(FamaMacBethKernelCovariance, self).__init__(y, x, params, all_params, - debiased=debiased) - self._name = 'Fama-MacBeth Kernel Cov' + def __init__( + self, + y, + x, + params, + all_params, + *, + debiased=False, + kernel="newey-west", + bandwidth=None + ): + super(FamaMacBethKernelCovariance, self).__init__( + y, x, params, all_params, debiased=debiased + ) + self._name = "Fama-MacBeth Kernel Cov" self._bandwidth = bandwidth self._kernel = kernel diff --git a/linearmodels/panel/data.py b/linearmodels/panel/data.py index e4f1496c4f..7418647ee0 100644 --- a/linearmodels/panel/data.py +++ b/linearmodels/panel/data.py @@ -12,7 +12,7 @@ from linearmodels.utility import ensure_unique_column, panel_to_frame -__all__ = ['PanelData'] +__all__ = ["PanelData"] class _Panel(object): @@ -35,14 +35,15 @@ def __init__(self, df): index = df.index self._major_axis = Index(index.levels[1][get_codes(index)[1]]).unique() self._minor_axis = Index(index.levels[0][get_codes(index)[0]]).unique() - self._full_index = MultiIndex.from_product([self._minor_axis, - self._major_axis]) + self._full_index = MultiIndex.from_product([self._minor_axis, self._major_axis]) new_df = df.reindex(self._full_index) new_df.index.names = df.index.names self._frame = new_df i, j, k = len(self._items), len(self._major_axis), len(self.minor_axis) self._shape = (i, j, k) - self._values = np.swapaxes(np.reshape(np.asarray(new_df).copy().T, (i, k, j)), 1, 2) + self._values = np.swapaxes( + np.reshape(np.asarray(new_df).copy().T, (i, k, j)), 1, 2 + ) @classmethod def from_array(cls, values, items, major_axis, minor_axis): @@ -81,11 +82,11 @@ def to_frame(self): def convert_columns(s, drop_first): if is_string_dtype(s.dtype) and s.map(is_string_like).all(): - s = s.astype('category') + s = s.astype("category") if is_categorical(s): out = get_dummies(s, drop_first=drop_first) - out.columns = [str(s.name) + '.' + str(c) for c in out] + out.columns = [str(s.name) + "." 
+ str(c) for c in out] return out return s @@ -140,13 +141,15 @@ class PanelData(object): DataFrame does not have 2 levels """ - def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy=True): + def __init__( + self, x, var_name="x", convert_dummies=True, drop_first=True, copy=True + ): self._var_name = var_name self._convert_dummies = convert_dummies self._drop_first = drop_first self._panel = None self._shape = None - index_names = ['entity', 'time'] + index_names = ["entity", "time"] if isinstance(x, PanelData): x = x.dataframe self._original = x @@ -154,9 +157,10 @@ def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy= if not isinstance(x, (Series, DataFrame, np.ndarray)): try: from xarray import DataArray + if isinstance(x, DataArray): if x.ndim not in (2, 3): - raise ValueError('Only 2-d or 3-d DataArrays are supported') + raise ValueError("Only 2-d or 3-d DataArrays are supported") if x.ndim == 2: x = x.to_pandas() else: @@ -171,14 +175,15 @@ def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy= if isinstance(x, Series) and isinstance(x.index, MultiIndex): x = DataFrame(x) elif isinstance(x, Series): - raise ValueError('Series can only be used with a 2-level MultiIndex') + raise ValueError("Series can only be used with a 2-level MultiIndex") if isinstance(x, DataFrame): if isinstance(x, DataFrame): if isinstance(x.index, MultiIndex): if len(x.index.levels) != 2: - raise ValueError('DataFrame input must have a ' - 'MultiIndex with 2 levels') + raise ValueError( + "DataFrame input must have a " "MultiIndex with 2 levels" + ) if isinstance(self._original, (DataFrame, PanelData, Series)): for i in range(2): index_names[i] = x.index.levels[i].name or index_names[i] @@ -191,33 +196,36 @@ def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy= self._frame = x.swapaxes(1, 2).to_frame(filter_observations=False) elif isinstance(x, np.ndarray): if x.ndim not in (2, 3): - raise ValueError('2 or 3-d array required for numpy input') + raise ValueError("2 or 3-d array required for numpy input") if x.ndim == 2: x = x[None, :, :] k, t, n = x.shape - var_str = var_name + '.{0:0>' + str(int(np.log10(k) + .01)) + '}' + var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}" variables = [var_name] if k == 1 else [var_str.format(i) for i in range(k)] - entity_str = 'entity.{0:0>' + str(int(np.log10(n) + .01)) + '}' + entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}" entities = [entity_str.format(i) for i in range(n)] time = list(range(t)) x = x.astype(np.float64, copy=False) - panel = _Panel.from_array(x, items=variables, major_axis=time, - minor_axis=entities) + panel = _Panel.from_array( + x, items=variables, major_axis=time, minor_axis=entities + ) self._fake_panel = panel self._frame = panel.to_frame() else: - raise TypeError('Only ndarrays, DataFrames or DataArrays are ' - 'supported') + raise TypeError("Only ndarrays, DataFrames or DataArrays are " "supported") if convert_dummies: self._frame = expand_categoricals(self._frame, drop_first) self._frame = self._frame.astype(np.float64, copy=False) time_index = Series(self._frame.index.levels[1]) - if not (is_numeric_dtype(time_index.dtype) or - is_datetime64_any_dtype(time_index.dtype)): - raise ValueError('The index on the time dimension must be either ' - 'numeric or date-like') + if not ( + is_numeric_dtype(time_index.dtype) + or is_datetime64_any_dtype(time_index.dtype) + ): + raise ValueError( + "The index on the time 
dimension must be either " "numeric or date-like" + ) # self._k, self._t, self._n = self.panel.shape self._k, self._t, self._n = self.shape self._frame.index.set_names(index_names, inplace=True) @@ -320,7 +328,7 @@ def entity_ids(self): Returns ------- - id : ndarray + ndarray 2d array containing entity ids corresponding dataframe view """ return np.asarray(get_codes(self._frame.index)[0])[:, None] @@ -332,16 +340,17 @@ def time_ids(self): Returns ------- - id : ndarray + ndarray 2d array containing time ids corresponding dataframe view """ return np.asarray(get_codes(self._frame.index)[1])[:, None] def _demean_both_low_mem(self, weights): - groups = PanelData(DataFrame(np.c_[self.entity_ids, self.time_ids], - index=self._frame.index), - convert_dummies=False, - copy=False) + groups = PanelData( + DataFrame(np.c_[self.entity_ids, self.time_ids], index=self._frame.index), + convert_dummies=False, + copy=False, + ) return self.general_demean(groups, weights=weights) def _demean_both(self, weights): @@ -354,11 +363,11 @@ def _demean_both(self, weights): Weights to use in demeaning """ if self.nentity > self.nobs: - group = 'entity' - dummy = 'time' + group = "entity" + dummy = "time" else: - group = 'time' - dummy = 'entity' + group = "time" + dummy = "entity" e = self.demean(group, weights=weights) d = self.dummies(dummy, drop_first=True) d.index = e.index @@ -366,8 +375,7 @@ def _demean_both(self, weights): d = d.values2d e = e.values2d resid = e - d @ lstsq(d, e)[0] - resid = DataFrame(resid, index=self._frame.index, - columns=self._frame.columns) + resid = DataFrame(resid, index=self._frame.index, columns=self._frame.columns) return PanelData(resid) @@ -384,7 +392,7 @@ def general_demean(self, groups, weights=None): Returns ------- - demeaned : PanelData + PanelData Weighted, demeaned data according to groups Notes @@ -394,20 +402,24 @@ def general_demean(self, groups, weights=None): if not isinstance(groups, PanelData): groups = PanelData(groups) if weights is None: - weights = PanelData(DataFrame(np.ones((self._frame.shape[0], 1)), - index=self.index, - columns=['weights'])) + weights = PanelData( + DataFrame( + np.ones((self._frame.shape[0], 1)), + index=self.index, + columns=["weights"], + ) + ) weights = weights.values2d groups = groups.values2d.astype(np.int64, copy=False) weight_sum = {} def weighted_group_mean(df, weights, root_w, level): - num = (root_w * df).groupby(level=level).transform('sum') + num = (root_w * df).groupby(level=level).transform("sum") if level in weight_sum: denom = weight_sum[level] else: - denom = weights.groupby(level=level).transform('sum') + denom = weights.groupby(level=level).transform("sum") weight_sum[level] = denom return np.asarray(num) / np.asarray(denom) @@ -452,7 +464,7 @@ def demean_pass(frame, weights, root_w): return PanelData(current) - def demean(self, group='entity', weights=None, return_panel=True, low_memory=False): + def demean(self, group="entity", weights=None, return_panel=True, low_memory=False): """ Demeans data by either entity or time group @@ -472,7 +484,7 @@ def demean(self, group='entity', weights=None, return_panel=True, low_memory=Fal Returns ------- - demeaned : PanelData + PanelData Demeaned data according to type Notes @@ -481,17 +493,17 @@ def demean(self, group='entity', weights=None, return_panel=True, low_memory=Fal the square root of the weights so that they can be used in WLS estimation. 
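A sketch of the demeaning entry points described above; weighted group means are sum(w * x) / sum(w) within each entity or time group, and the output is scaled by sqrt(w) for use in WLS (reusing `panel_df` from the earlier sketch):

from linearmodels.panel.data import PanelData

pdata = PanelData(panel_df[["x"]])
within_entity = pdata.demean("entity")                     # x minus entity means
within_both = pdata.demean("both", low_memory=True)        # two-way, low-memory path
time_demeaned = pdata.demean("time", return_panel=False)   # plain ndarray output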
""" - if group not in ('entity', 'time', 'both'): + if group not in ("entity", "time", "both"): raise ValueError - if group == 'both': + if group == "both": if not low_memory: return self._demean_both(weights) else: return self._demean_both_low_mem(weights) - level = 0 if group == 'entity' else 1 + level = 0 if group == "entity" else 1 if weights is None: - group_mu = self._frame.groupby(level=level).transform('mean') + group_mu = self._frame.groupby(level=level).transform("mean") out = self._frame - group_mu if not return_panel: return np.asarray(out) @@ -500,9 +512,9 @@ def demean(self, group='entity', weights=None, return_panel=True, low_memory=Fal w = weights.values2d frame = self._frame.copy() frame = w * frame - weighted_sum = frame.groupby(level=level).transform('sum') + weighted_sum = frame.groupby(level=level).transform("sum") frame.iloc[:, :] = w - sum_weights = frame.groupby(level=level).transform('sum') + sum_weights = frame.groupby(level=level).transform("sum") group_mu = weighted_sum / sum_weights out = np.sqrt(w) * (self._frame - group_mu) if not return_panel: @@ -510,15 +522,21 @@ def demean(self, group='entity', weights=None, return_panel=True, low_memory=Fal return PanelData(out) def __str__(self): - return self.__class__.__name__ + '\n' + str(self._frame) + return self.__class__.__name__ + "\n" + str(self._frame) def __repr__(self): - return self.__str__() + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self)) + return ( + self.__str__() + + "\n" + + self.__class__.__name__ + + " object, id: " + + hex(id(self)) + ) def _repr_html_(self): - return self.__class__.__name__ + '
<br/>' + self._frame._repr_html_()
+        return self.__class__.__name__ + "<br/>
" + self._frame._repr_html_() - def count(self, group='entity'): + def count(self, group="entity"): """ Count number of observations by entity or time @@ -529,11 +547,11 @@ def count(self, group='entity'): Returns ------- - count : DataFrame + DataFrame Counts according to type. Either (entity by var) or (time by var) """ - level = 0 if group == 'entity' else 1 - reindex = self.entities if group == 'entity' else self.time + level = 0 if group == "entity" else 1 + reindex = self.entities if group == "entity" else self.time out = self._frame.groupby(level=level).count() return out.reindex(reindex) @@ -545,10 +563,14 @@ def index(self): def copy(self): """Return a deep copy""" - return PanelData(self._frame.copy(), var_name=self._var_name, - convert_dummies=self._convert_dummies, drop_first=self._drop_first) - - def mean(self, group='entity', weights=None): + return PanelData( + self._frame.copy(), + var_name=self._var_name, + convert_dummies=self._convert_dummies, + drop_first=self._drop_first, + ) + + def mean(self, group="entity", weights=None): """ Compute data mean by either entity or time group @@ -561,10 +583,10 @@ def mean(self, group='entity', weights=None): Returns ------- - mean : DataFrame + DataFrame Data mean according to type. Either (entity by var) or (time by var) """ - level = 0 if group == 'entity' else 1 + level = 0 if group == "entity" else 1 if weights is None: mu = self._frame.groupby(level=level).mean() else: @@ -576,7 +598,7 @@ def mean(self, group='entity', weights=None): sum_weights = frame.groupby(level=level).sum() mu = weighted_sum / sum_weights - reindex = self.entities if group == 'entity' else self.time + reindex = self.entities if group == "entity" else self.time out = mu.reindex(reindex) return out @@ -587,14 +609,19 @@ def first_difference(self): Returns ------- - diffs : PanelData + PanelData Differenced values """ diffs = self.panel.values diffs = diffs[:, 1:] - diffs[:, :-1] - diffs = panel_to_frame(diffs, self.panel.items, self.panel.major_axis[1:], - self.panel.minor_axis, True) - diffs = diffs.reindex(self._frame.index).dropna(how='any') + diffs = panel_to_frame( + diffs, + self.panel.items, + self.panel.major_axis[1:], + self.panel.minor_axis, + True, + ) + diffs = diffs.reindex(self._frame.index).dropna(how="any") return PanelData(diffs) @staticmethod @@ -610,7 +637,7 @@ def _minimize_multiindex(df): df.index.names = orig_names return df - def dummies(self, group='entity', drop_first=False): + def dummies(self, group="entity", drop_first=False): """ Generate entity or time dummies @@ -624,15 +651,15 @@ def dummies(self, group='entity', drop_first=False): Returns ------- - dummies : DataFrame + DataFrame Dummy variables """ - if group not in ('entity', 'time'): + if group not in ("entity", "time"): raise ValueError - axis = 0 if group == 'entity' else 1 + axis = 0 if group == "entity" else 1 labels = get_codes(self._frame.index) levels = self._frame.index.levels cat = Categorical(levels[axis][labels[axis]]) dummies = get_dummies(cat, drop_first=drop_first) - cols = self.entities if group == 'entity' else self.time + cols = self.entities if group == "entity" else self.time return dummies[[c for c in cols if c in dummies]].astype(np.float64, copy=False) diff --git a/linearmodels/panel/model.py b/linearmodels/panel/model.py index 1c6a7d5bbb..5a333d99a0 100644 --- a/linearmodels/panel/model.py +++ b/linearmodels/panel/model.py @@ -18,8 +18,10 @@ from linearmodels.panel.data import PanelData from linearmodels.panel.results import (PanelEffectsResults, 
PanelResults, RandomEffectsResults) -from linearmodels.panel.utility import (check_absorbed, dummy_matrix, in_2core_graph, - not_absorbed, AbsorbingEffectWarning, absorbing_warn_msg) +from linearmodels.panel.utility import (AbsorbingEffectWarning, + absorbing_warn_msg, check_absorbed, + dummy_matrix, in_2core_graph, + not_absorbed) from linearmodels.utility import (AttrDict, InapplicableTestStatistic, InferenceUnavailableWarning, InvalidTestStatistic, MemoryWarning, @@ -31,7 +33,7 @@ def panel_structure_stats(ids, name): bc = np.bincount(ids) - index = ['mean', 'median', 'max', 'min', 'total'] + index = ["mean", "median", "max", "min", "total"] out = [bc.mean(), np.median(bc), bc.max(), bc.min(), bc.shape[0]] return pd.Series(out, index=index, name=name) @@ -57,19 +59,19 @@ class PanelFormulaParser(object): def __init__(self, formula, data, eval_env=2): self._formula = formula self._data = PanelData(data, convert_dummies=False, copy=False) - self._na_action = NAAction(on_NA='raise', NA_types=[]) + self._na_action = NAAction(on_NA="raise", NA_types=[]) self._eval_env = eval_env self._dependent = self._exog = None self._parse() def _parse(self): - parts = self._formula.split('~') - parts[1] = ' 0 + ' + parts[1] - cln_formula = '~'.join(parts) + parts = self._formula.split("~") + parts[1] = " 0 + " + parts[1] + cln_formula = "~".join(parts) mod_descr = ModelDesc.from_formula(cln_formula) rm_list = [] - effects = {'EntityEffects': False, 'FixedEffects': False, 'TimeEffects': False} + effects = {"EntityEffects": False, "FixedEffects": False, "TimeEffects": False} for term in mod_descr.rhs_termlist: if term.name() in effects: effects[term.name()] = True @@ -77,13 +79,13 @@ def _parse(self): for term in rm_list: mod_descr.rhs_termlist.remove(term) - if effects['EntityEffects'] and effects['FixedEffects']: - raise ValueError('Cannot use both FixedEffects and EntityEffects') - self._entity_effect = effects['EntityEffects'] or effects['FixedEffects'] - self._time_effect = effects['TimeEffects'] + if effects["EntityEffects"] and effects["FixedEffects"]: + raise ValueError("Cannot use both FixedEffects and EntityEffects") + self._entity_effect = effects["EntityEffects"] or effects["FixedEffects"] + self._time_effect = effects["TimeEffects"] cln_formula = mod_descr.describe() - self._lhs, self._rhs = map(lambda s: s.strip(), cln_formula.split('~')) - self._lhs = '0 + ' + self._lhs + self._lhs, self._rhs = map(lambda s: s.strip(), cln_formula.split("~")) + self._lhs = "0 + " + self._lhs @property def entity_effect(self): @@ -115,14 +117,24 @@ def data(self): @property def dependent(self): """DataFrame containing the dependent variable""" - return dmatrix(self._lhs, self._data.dataframe, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + return dmatrix( + self._lhs, + self._data.dataframe, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) @property def exog(self): """DataFrame containing the exogenous variables""" - out = dmatrix(self._rhs, self._data.dataframe, eval_env=self._eval_env, - return_type='dataframe', NA_action=self._na_action) + out = dmatrix( + self._rhs, + self._data.dataframe, + eval_env=self._eval_env, + return_type="dataframe", + NA_action=self._na_action, + ) return out @@ -130,9 +142,15 @@ class AmbiguityError(Exception): pass -__all__ = ['PanelOLS', 'PooledOLS', 'RandomEffects', 'FirstDifferenceOLS', - 'BetweenOLS', 'AmbiguityError', - 'FamaMacBeth'] +__all__ = [ + "PanelOLS", + "PooledOLS", + "RandomEffects", + 
"FirstDifferenceOLS", + "BetweenOLS", + "AmbiguityError", + "FamaMacBeth", +] # Likely @@ -150,11 +168,11 @@ class PooledOLS(object): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity) - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight to that the residual time the weight should be homoskedastic. @@ -169,8 +187,8 @@ class PooledOLS(object): """ def __init__(self, dependent, exog, *, weights=None): - self.dependent = PanelData(dependent, 'Dep') - self.exog = PanelData(exog, 'Exog') + self.dependent = PanelData(dependent, "Dep") + self.exog = PanelData(exog, "Exog") self._original_shape = self.dependent.shape self._constant = None self._formula = None @@ -178,21 +196,28 @@ def __init__(self, dependent, exog, *, weights=None): self._name = self.__class__.__name__ self.weights = self._adapt_weights(weights) self._not_null = np.ones(self.dependent.values2d.shape[0], dtype=np.bool) - self._cov_estimators = CovarianceManager(self.__class__.__name__, HomoskedasticCovariance, - HeteroskedasticCovariance, ClusteredCovariance, - DriscollKraay, ACCovariance) + self._cov_estimators = CovarianceManager( + self.__class__.__name__, + HomoskedasticCovariance, + HeteroskedasticCovariance, + ClusteredCovariance, + DriscollKraay, + ACCovariance, + ) self._original_index = self.dependent.index.copy() self._validate_data() self._singleton_index = None def __str__(self): - out = '{name} \nNum exog: {num_exog}, Constant: {has_constant}' - return out.format(name=self.__class__.__name__, - num_exog=self.exog.dataframe.shape[1], - has_constant=self.has_constant) + out = "{name} \nNum exog: {num_exog}, Constant: {has_constant}" + return out.format( + name=self.__class__.__name__, + num_exog=self.exog.dataframe.shape[1], + has_constant=self.has_constant, + ) def __repr__(self): - return self.__str__() + '\nid: ' + str(hex(id(self))) + return self.__str__() + "\nid: " + str(hex(id(self))) def reformat_clusters(self, clusters): """ @@ -200,12 +225,12 @@ def reformat_clusters(self, clusters): Parameters ---------- - clusters : array-like + clusters : array_like Values to use for variance clustering Returns ------- - reformatted : PanelData + PanelData Original data with matching axis and observation dropped where missing in the model data. @@ -213,20 +238,24 @@ def reformat_clusters(self, clusters): ----- This is exposed for testing and is not normally needed for estimation """ - clusters = PanelData(clusters, var_name='cov.cluster', convert_dummies=False) + clusters = PanelData(clusters, var_name="cov.cluster", convert_dummies=False) if clusters.shape[1:] != self._original_shape[1:]: - raise ValueError('clusters must have the same number of entities ' - 'and time periods as the model data.') + raise ValueError( + "clusters must have the same number of entities " + "and time periods as the model data." 
+            )

         clusters.drop(~self.not_null)

         return clusters

     def _info(self):
         """Information about panel structure"""
-        entity_info = panel_structure_stats(self.dependent.entity_ids.squeeze(),
-                                            'Observations per entity')
-        time_info = panel_structure_stats(self.dependent.time_ids.squeeze(),
-                                          'Observations per time period')
+        entity_info = panel_structure_stats(
+            self.dependent.entity_ids.squeeze(), "Observations per entity"
+        )
+        time_info = panel_structure_stats(
+            self.dependent.time_ids.squeeze(), "Observations per time period"
+        )
         other_info = None

         return entity_info, time_info, other_info

@@ -237,11 +266,10 @@ def _adapt_weights(self, weights):
             self._is_weighted = False
             frame = self.dependent.dataframe.copy()
             frame.iloc[:, :] = 1
-            frame.columns = ['weight']
+            frame.columns = ["weight"]
             return PanelData(frame)

-        frame = pd.DataFrame(columns=self.dependent.entities,
-                             index=self.dependent.time)
+        frame = pd.DataFrame(columns=self.dependent.entities, index=self.dependent.time)
         nobs, nentity = self.exog.nobs, self.exog.nentity

         if weights.ndim == 3 or weights.shape == (nobs, nentity):
@@ -249,11 +277,15 @@ def _adapt_weights(self, weights):
             weights = np.squeeze(weights)

         if weights.shape[0] == nobs and nobs == nentity:
-            raise AmbiguityError('Unable to distinguish nobs form nentity since they are '
-                                 'equal. You must use an 2-d array to avoid ambiguity.')
-        if (isinstance(weights, (pd.Series, pd.DataFrame)) and
-                isinstance(weights.index, pd.MultiIndex) and
-                weights.shape[0] == self.dependent.dataframe.shape[0]):
+            raise AmbiguityError(
+                "Unable to distinguish nobs from nentity since they are "
+                "equal. You must use a 2-d array to avoid ambiguity."
+            )
+        if (
+            isinstance(weights, (pd.Series, pd.DataFrame))
+            and isinstance(weights.index, pd.MultiIndex)
+            and weights.shape[0] == self.dependent.dataframe.shape[0]
+        ):
             frame = weights
         elif weights.shape[0] == nobs:
             weights = np.asarray(weights)[:, None]
@@ -267,14 +299,14 @@ def _adapt_weights(self, weights):
             frame = self.dependent.dataframe.copy()
             frame.iloc[:, :] = weights[:, None]
         else:
-            raise ValueError('Weights do not have a supported shape.')
+            raise ValueError("Weights do not have a supported shape.")

         return PanelData(frame)

     def _check_exog_rank(self):
         x = self.exog.values2d
         rank_of_x = matrix_rank(x)
         if rank_of_x < x.shape[1]:
-            raise ValueError('exog does not have full column rank.')
+            raise ValueError("exog does not have full column rank.")
         return rank_of_x

     def _validate_data(self):
@@ -283,16 +315,20 @@ def _validate_data(self):
         x = self._x = self.exog.values2d
         w = self._w = self.weights.values2d
         if y.shape[0] != x.shape[0]:
-            raise ValueError('dependent and exog must have the same number of '
-                             'observations.')
+            raise ValueError(
+                "dependent and exog must have the same number of " "observations."
+            )
         if y.shape[0] != w.shape[0]:
-            raise ValueError('weights must have the same number of '
-                             'observations as dependent.')
+            raise ValueError(
+                "weights must have the same number of " "observations as dependent."
+ ) all_missing = np.any(np.isnan(y), axis=1) & np.all(np.isnan(x), axis=1) - missing = (np.any(np.isnan(y), axis=1) | - np.any(np.isnan(x), axis=1) | - np.any(np.isnan(w), axis=1)) + missing = ( + np.any(np.isnan(y), axis=1) + | np.any(np.isnan(x), axis=1) + | np.any(np.isnan(w), axis=1) + ) missing_warning(all_missing ^ missing) if np.any(missing): @@ -305,7 +341,7 @@ def _validate_data(self): w = self.weights.dataframe if np.any(np.asarray(w) <= 0): - raise ValueError('weights must be strictly positive.') + raise ValueError("weights must be strictly positive.") w = w / w.mean() self.weights = PanelData(w) rank_of_x = self._check_exog_rank() @@ -329,11 +365,10 @@ def _f_statistic(self, weps, y, x, root_w, df_resid): """Compute model F-statistic""" weps_const = y num_df = x.shape[1] - name = 'Model F-statistic (homoskedastic)' + name = "Model F-statistic (homoskedastic)" if self.has_constant: if num_df == 1: - return InvalidTestStatistic('Model contains only a constant', - name=name) + return InvalidTestStatistic("Model contains only a constant", name=name) num_df -= 1 weps_const = y - float((root_w.T @ y) / (root_w.T @ root_w)) @@ -343,17 +378,21 @@ def _f_statistic(self, weps, y, x, root_w, df_resid): denom = resid_ss denom_df = df_resid stat = float((num / num_df) / (denom / denom_df)) - return WaldTestStatistic(stat, null='All parameters ex. constant not zero', - df=num_df, df_denom=denom_df, name=name) + return WaldTestStatistic( + stat, + null="All parameters ex. constant not zero", + df=num_df, + df_denom=denom_df, + name=name, + ) def _f_statistic_robust(self, params, cov_est, debiased, df_resid): """Compute Wald test that all parameters are 0, ex. constant""" sel = np.ones(params.shape[0], dtype=np.bool) - name = 'Model F-statistic (robust)' + name = "Model F-statistic (robust)" def invalid_f(): - return InvalidTestStatistic('Model contains only a constant', - name=name) + return InvalidTestStatistic("Model contains only a constant", name=name) if self.has_constant: if len(sel) == 1: @@ -366,11 +405,10 @@ def deferred_f(): test_stat = test_params.T @ np.linalg.inv(test_cov) @ test_params test_stat = float(test_stat) df = sel.sum() - null = 'All parameters ex. constant not zero' + null = "All parameters ex. 
constant not zero" if debiased: - wald = WaldTestStatistic(test_stat / df, null, df, df_resid, - name=name) + wald = WaldTestStatistic(test_stat / df, null, df, df_resid, name=name) else: wald = WaldTestStatistic(test_stat, null, df, name=name) return wald @@ -380,10 +418,10 @@ def deferred_f(): def _prepare_between(self): """Prepare values for between estimation of R2""" weights = self.weights if self._is_weighted else None - y = self.dependent.mean('entity', weights=weights).values - x = self.exog.mean('entity', weights=weights).values + y = self.dependent.mean("entity", weights=weights).values + x = self.exog.mean("entity", weights=weights).values # Weight transformation - wcount, wmean = self.weights.count('entity'), self.weights.mean('entity') + wcount, wmean = self.weights.count("entity"), self.weights.mean("entity") wsum = wcount * wmean w = wsum.values w = w / w.mean() @@ -435,9 +473,8 @@ def _rsquared(self, params, reweight=False): # R2 - Within ############################################# weights = self.weights if self._is_weighted else None - wy = self.dependent.demean('entity', weights=weights, - return_panel=False) - wx = self.exog.demean('entity', weights=weights, return_panel=False) + wy = self.dependent.demean("entity", weights=weights, return_panel=False) + wx = self.exog.demean("entity", weights=weights, return_panel=False) weps = wy - wx @ params residual_ss = float(weps.T @ weps) total_ss = float(wy.T @ wy) @@ -453,21 +490,38 @@ def _postestimation(self, params, cov, debiased, df_resid, weps, y, x, root_w): deferred_f = self._f_statistic_robust(params, cov, debiased, df_resid) f_stat = self._f_statistic(weps, y, x, root_w, df_resid) r2o, r2w, r2b = self._rsquared(params) - f_pooled = InapplicableTestStatistic(reason='Model has no effects', - name='Pooled F-stat') + f_pooled = InapplicableTestStatistic( + reason="Model has no effects", name="Pooled F-stat" + ) entity_info, time_info, other_info = self._info() nobs = weps.shape[0] sigma2 = float(weps.T @ weps / nobs) loglik = -0.5 * nobs * (np.log(2 * np.pi) + np.log(sigma2) + 1) - res = AttrDict(params=params, deferred_cov=cov.deferred_cov, - deferred_f=deferred_f, f_stat=f_stat, - debiased=debiased, name=self._name, var_names=self.exog.vars, - r2w=r2w, r2b=r2b, r2=r2w, r2o=r2o, s2=cov.s2, - model=self, cov_type=cov.name, index=self.dependent.index, - entity_info=entity_info, time_info=time_info, other_info=other_info, - f_pooled=f_pooled, loglik=loglik, not_null=self._not_null, - original_index=self._original_index) + res = AttrDict( + params=params, + deferred_cov=cov.deferred_cov, + deferred_f=deferred_f, + f_stat=f_stat, + debiased=debiased, + name=self._name, + var_names=self.exog.vars, + r2w=r2w, + r2b=r2b, + r2=r2w, + r2o=r2o, + s2=cov.s2, + model=self, + cov_type=cov.name, + index=self.dependent.index, + entity_info=entity_info, + time_info=time_info, + other_info=other_info, + f_pooled=f_pooled, + loglik=loglik, + not_null=self._not_null, + original_index=self._original_index, + ) return res @property @@ -484,18 +538,18 @@ def from_formula(cls, formula, data, *, weights=None): ---------- formula : str Formula to transform into model. Conforms to patsy formula rules. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like, optional + weights: array_like, optional Weights to use in estimation. 
Assumes residual variance is proportional to inverse of weight to that the residual times the weight should be homoskedastic. Returns ------- - model : PooledOLS + PooledOLS Model specified using the formula Notes @@ -518,12 +572,12 @@ def from_formula(cls, formula, data, *, weights=None): def _choose_cov(self, cov_type, **cov_config): cov_est = self._cov_estimators[cov_type] - if cov_type != 'clustered': + if cov_type != "clustered": return cov_est, cov_config cov_config_upd = {k: v for k, v in cov_config.items()} - clusters = cov_config.get('clusters', None) + clusters = cov_config.get("clusters", None) if clusters is not None: clusters = self.reformat_clusters(clusters).copy() for col in clusters.dataframe: @@ -531,25 +585,21 @@ def _choose_cov(self, cov_type, **cov_config): clusters.dataframe[col] = cat.codes.astype(np.int64) clusters = clusters.dataframe - cluster_entity = cov_config_upd.pop('cluster_entity', False) + cluster_entity = cov_config_upd.pop("cluster_entity", False) if cluster_entity: group_ids = self.dependent.entity_ids.squeeze() - name = 'cov.cluster.entity' - group_ids = pd.Series(group_ids, - index=self.dependent.index, - name=name) + name = "cov.cluster.entity" + group_ids = pd.Series(group_ids, index=self.dependent.index, name=name) if clusters is not None: clusters[name] = group_ids else: clusters = pd.DataFrame(group_ids) - cluster_time = cov_config_upd.pop('cluster_time', False) + cluster_time = cov_config_upd.pop("cluster_time", False) if cluster_time: group_ids = self.dependent.time_ids.squeeze() - name = 'cov.cluster.time' - group_ids = pd.Series(group_ids, - index=self.dependent.index, - name=name) + name = "cov.cluster.time" + group_ids = pd.Series(group_ids, index=self.dependent.index, name=name) if clusters is not None: clusters[name] = group_ids else: @@ -557,11 +607,13 @@ def _choose_cov(self, cov_type, **cov_config): if self._singleton_index is not None and clusters is not None: clusters = clusters.loc[~self._singleton_index] - cov_config_upd['clusters'] = np.asarray(clusters) if clusters is not None else clusters + cov_config_upd["clusters"] = ( + np.asarray(clusters) if clusters is not None else clusters + ) return cov_est, cov_config_upd - def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): + def fit(self, *, cov_type="unadjusted", debiased=True, **cov_config): """ Estimate model parameters @@ -577,7 +629,7 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): Returns ------- - results : PanelResults + PanelResults Estimation results Examples @@ -623,14 +675,23 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): df_model = x.shape[1] df_resid = nobs - df_model cov_est, cov_config = self._choose_cov(cov_type, **cov_config) - cov = cov_est(wy, wx, params, self.dependent.entity_ids, self.dependent.time_ids, - debiased=debiased, **cov_config) + cov = cov_est( + wy, + wx, + params, + self.dependent.entity_ids, + self.dependent.time_ids, + debiased=debiased, + **cov_config + ) weps = wy - wx @ params index = self.dependent.index - fitted = pd.DataFrame(x @ params, index, ['fitted_values']) - effects = pd.DataFrame(np.full_like(fitted.values, np.nan), index, ['estimated_effects']) + fitted = pd.DataFrame(x @ params, index, ["fitted_values"]) + effects = pd.DataFrame( + np.full_like(fitted.values, np.nan), index, ["estimated_effects"] + ) eps = y - fitted.values - idiosyncratic = pd.DataFrame(eps, index, ['idiosyncratic']) + idiosyncratic = pd.DataFrame(eps, index, ["idiosyncratic"]) residual_ss = 
float(weps.T @ weps) e = y if self._constant: @@ -639,11 +700,25 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): total_ss = float(w.T @ (e ** 2)) r2 = 1 - residual_ss / total_ss - res = self._postestimation(params, cov, debiased, df_resid, weps, wy, wx, root_w) - res.update(dict(df_resid=df_resid, df_model=df_model, nobs=y.shape[0], - residual_ss=residual_ss, total_ss=total_ss, r2=r2, wresids=weps, - resids=eps, index=self.dependent.index, fitted=fitted, effects=effects, - idiosyncratic=idiosyncratic)) + res = self._postestimation( + params, cov, debiased, df_resid, weps, wy, wx, root_w + ) + res.update( + dict( + df_resid=df_resid, + df_model=df_model, + nobs=y.shape[0], + residual_ss=residual_ss, + total_ss=total_ss, + r2=r2, + wresids=weps, + resids=eps, + index=self.dependent.index, + fitted=fitted, + effects=effects, + idiosyncratic=idiosyncratic, + ) + ) return PanelResults(res) @@ -653,9 +728,9 @@ def predict(self, params, *, exog=None, data=None, eval_env=4): Parameters ---------- - params : array-like + params : array_like Model parameters (nvar by 1) - exog : array-like + exog : array_like Exogenous regressors (nobs by nvar) data : DataFrame Values to use when making predictions from a model constructed @@ -665,7 +740,7 @@ def predict(self, params, *, exog=None, data=None, eval_env=4): Returns ------- - predictions : DataFrame + DataFrame Fitted values from supplied data and parameters Notes @@ -678,11 +753,14 @@ def predict(self, params, *, exog=None, data=None, eval_env=4): values corresponding to the original model specification. """ if data is not None and self.formula is None: - raise ValueError('Unable to use data when the model was not ' - 'created using a formula.') + raise ValueError( + "Unable to use data when the model was not " "created using a formula." + ) if data is not None and exog is not None: - raise ValueError('Predictions can only be constructed using one ' - 'of exog or data, but not both.') + raise ValueError( + "Predictions can only be constructed using one " + "of exog or data, but not both." + ) if exog is not None: exog = PanelData(exog).dataframe else: @@ -692,7 +770,7 @@ def predict(self, params, *, exog=None, data=None, eval_env=4): params = np.atleast_2d(np.asarray(params)) if params.shape[0] == 1: params = params.T - pred = pd.DataFrame(x @ params, index=exog.index, columns=['predictions']) + pred = pd.DataFrame(x @ params, index=exog.index, columns=["predictions"]) return pred @@ -703,11 +781,11 @@ class PanelOLS(PooledOLS): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity). - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight to that the residual time the weight should be homoskedastic. @@ -715,7 +793,7 @@ class PanelOLS(PooledOLS): Flag whether to include entity (fixed) effects in the model time_effects : bool, optional Flag whether to include time effects in the model - other_effects : array-like, optional + other_effects : array_like, optional Category codes to use for any effects that are not entity or time effects. Each variable is treated as an effect. singletons : bool, optional @@ -757,8 +835,18 @@ class PanelOLS(PooledOLS): 2 other. 
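To make the fitted and effects frames assembled in fit above concrete, a sketch; `estimated_effects` and `fitted_values` are assumed to be the result properties backing the frames stored in the result dict (reusing `panel_df`):

res_fe = PanelOLS(
    panel_df.y, panel_df[["x"]], entity_effects=True, time_effects=True
).fit()
print(res_fe.estimated_effects.head())   # combined entity/time effects
print(res_fe.fitted_values.head())       # x @ params, as constructed in fit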
""" - def __init__(self, dependent, exog, *, weights=None, entity_effects=False, time_effects=False, - other_effects=None, singletons=True, drop_absorbed=False): + def __init__( + self, + dependent, + exog, + *, + weights=None, + entity_effects=False, + time_effects=False, + other_effects=None, + singletons=True, + drop_absorbed=False + ): super(PanelOLS, self).__init__(dependent, exog, weights=weights) self._entity_effects = entity_effects @@ -795,9 +883,12 @@ def _drop_singletons(self): return import warnings as warn + nobs = retain.shape[0] ndropped = nobs - retain.sum() - warn.warn('{0} singleton observations dropped'.format(ndropped), SingletonWarning) + warn.warn( + "{0} singleton observations dropped".format(ndropped), SingletonWarning + ) drop = ~retain self._singleton_index = drop self.dependent.drop(drop) @@ -810,11 +901,15 @@ def _drop_singletons(self): def __str__(self): out = super(PanelOLS, self).__str__() - additional = '\nEntity Effects: {ee}, Time Effects: {te}, Num Other Effects: {oe}' + additional = ( + "\nEntity Effects: {ee}, Time Effects: {te}, Num Other Effects: {oe}" + ) oe = 0 if self.other_effects: oe = self._other_effect_cats.nvar - additional = additional.format(ee=self.entity_effects, te=self.time_effects, oe=oe) + additional = additional.format( + ee=self.entity_effects, te=self.time_effects, oe=oe + ) out += additional return out @@ -822,16 +917,17 @@ def _validate_effects(self, effects): """Check model effects""" if effects is None: return False - effects = PanelData(effects, var_name='OtherEffect', - convert_dummies=False) + effects = PanelData(effects, var_name="OtherEffect", convert_dummies=False) if effects.shape[1:] != self._original_shape[1:]: - raise ValueError('other_effects must have the same number of ' - 'entities and time periods as dependent.') + raise ValueError( + "other_effects must have the same number of " + "entities and time periods as dependent." 
+ ) num_effects = effects.nvar if num_effects + self.entity_effects + self.time_effects > 2: - raise ValueError('At most two effects supported.') + raise ValueError("At most two effects supported.") cats = {} effects_frame = effects.dataframe for col in effects_frame: @@ -847,18 +943,20 @@ def _validate_effects(self, effects): if cats.shape[1] == 2: nested = self._is_effect_nested(cats[:, [0]], cats[:, [1]]) nested |= self._is_effect_nested(cats[:, [1]], cats[:, [0]]) - nesting_effect = 'other effects' + nesting_effect = "other effects" elif self.entity_effects: nested = self._is_effect_nested(cats[:, [0]], self.dependent.entity_ids) nested |= self._is_effect_nested(self.dependent.entity_ids, cats[:, [0]]) - nesting_effect = 'entity effects' + nesting_effect = "entity effects" elif self.time_effects: nested = self._is_effect_nested(cats[:, [0]], self.dependent.time_ids) nested |= self._is_effect_nested(self.dependent.time_ids, cats[:, [0]]) - nesting_effect = 'time effects' + nesting_effect = "time effects" if nested: - raise ValueError('Included other effects nest or are nested ' - 'by {effect}'.format(effect=nesting_effect)) + raise ValueError( + "Included other effects nest or are nested " + "by {effect}".format(effect=nesting_effect) + ) return True @@ -878,8 +976,16 @@ def other_effects(self): return self._other_effects @classmethod - def from_formula(cls, formula, data, *, weights=None, other_effects=None, - singletons=True, drop_absorbed=False): + def from_formula( + cls, + formula, + data, + *, + weights=None, + other_effects=None, + singletons=True, + drop_absorbed=False + ): """ Create a model from a formula @@ -890,15 +996,15 @@ def from_formula(cls, formula, data, *, weights=None, other_effects=None, with two special variable names, EntityEffects and TimeEffects which can be used to specify that the model should contain an entity effect or a time effect, respectively. See Examples. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like + weights: array_like Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. - other_effects : array-like, optional + other_effects : array_like, optional Category codes to use for any effects that are not entity or time effects. Each variable is treated as an effect.
singletons : bool, optional @@ -910,7 +1016,7 @@ def from_formula(cls, formula, data, *, weights=None, other_effects=None, Returns ------- - model : PanelOLS + PanelOLS Model specified using the formula Examples @@ -923,9 +1029,16 @@ def from_formula(cls, formula, data, *, weights=None, other_effects=None, entity_effect = parser.entity_effect time_effect = parser.time_effect dependent, exog = parser.data - mod = cls(dependent, exog, entity_effects=entity_effect, time_effects=time_effect, - weights=weights, other_effects=other_effects, singletons=singletons, - drop_absorbed=drop_absorbed) + mod = cls( + dependent, + exog, + entity_effects=entity_effect, + time_effects=time_effect, + weights=weights, + other_effects=other_effects, + singletons=singletons, + drop_absorbed=drop_absorbed, + ) mod.formula = formula return mod @@ -972,7 +1085,7 @@ def _lsmr_path(self): wy_mean = csc_matrix(wy_mean) # Purge fitted, weighted values - sp_cond = diags(cond, format='csc') + sp_cond = diags(cond, format="csc") wx = wx - (wd @ sp_cond @ wx_mean).A wy = wy - (wd @ sp_cond @ wy_mean).A @@ -1001,15 +1114,17 @@ def _slow_path(self): drop_first = self._constant d = [] if self.entity_effects: - d.append(self.dependent.dummies('entity', drop_first=drop_first).values) + d.append(self.dependent.dummies("entity", drop_first=drop_first).values) drop_first = True if self.time_effects: - d.append(self.dependent.dummies('time', drop_first=drop_first).values) + d.append(self.dependent.dummies("time", drop_first=drop_first).values) drop_first = True if self.other_effects: oe = self._other_effect_cats.dataframe for c in oe: - dummies = pd.get_dummies(oe[c], drop_first=drop_first).astype(np.float64) + dummies = pd.get_dummies(oe[c], drop_first=drop_first).astype( + np.float64 + ) d.append(dummies.values) drop_first = True @@ -1046,10 +1161,14 @@ def _choose_twoway_algo(self): low_memory = reg_size > 2 ** 10 if low_memory: import warnings - warnings.warn('Using low-memory algorithm to estimate two-way model. Explicitly set ' - 'low_memory=True to silence this message. Set low_memory=False to use ' - 'the standard algorithm that creates dummy variables for the smaller of ' - 'the number of entities or number of time periods.', MemoryWarning) + + warnings.warn( + "Using low-memory algorithm to estimate two-way model. Explicitly set " + "low_memory=True to silence this message. 
Set low_memory=False to use " + "the standard algorithm that creates dummy variables for the smaller of " + "the number of entities or number of time periods.", + MemoryWarning, + ) return low_memory def _fast_path(self, low_memory): @@ -1075,19 +1194,19 @@ def _fast_path(self, low_memory): effect = self.dependent.entity_ids else: effect = self.dependent.time_ids - col = ensure_unique_column('additional.effect', groups.dataframe) + col = ensure_unique_column("additional.effect", groups.dataframe) groups.dataframe[col] = effect y = y.general_demean(groups) x = x.general_demean(groups) elif self.entity_effects and self.time_effects: - y = y.demean('both', low_memory=low_memory) - x = x.demean('both', low_memory=low_memory) + y = y.demean("both", low_memory=low_memory) + x = x.demean("both", low_memory=low_memory) elif self.entity_effects: - y = y.demean('entity') - x = x.demean('entity') + y = y.demean("entity") + x = x.demean("entity") else: # self.time_effects - y = y.demean('time') - x = x.demean('time') + y = y.demean("time") + x = x.demean("time") y = y.values2d x = x.values2d @@ -1127,19 +1246,19 @@ def _weighted_fast_path(self, low_memory): effect = self.dependent.entity_ids else: effect = self.dependent.time_ids - col = ensure_unique_column('additional.effect', groups.dataframe) + col = ensure_unique_column("additional.effect", groups.dataframe) groups.dataframe[col] = effect wy = y.general_demean(groups, weights=self.weights) wx = x.general_demean(groups, weights=self.weights) elif self.entity_effects and self.time_effects: - wy = y.demean('both', weights=self.weights, low_memory=low_memory) - wx = x.demean('both', weights=self.weights, low_memory=low_memory) + wy = y.demean("both", weights=self.weights, low_memory=low_memory) + wx = x.demean("both", weights=self.weights, low_memory=low_memory) elif self.entity_effects: - wy = y.demean('entity', weights=self.weights) - wx = x.demean('entity', weights=self.weights) + wy = y.demean("entity", weights=self.weights) + wx = x.demean("entity", weights=self.weights) else: # self.time_effects - wy = y.demean('time', weights=self.weights) - wx = x.demean('time', weights=self.weights) + wy = y.demean("time", weights=self.weights) + wx = x.demean("time", weights=self.weights) wy = wy.values2d wx = wx.values2d @@ -1164,8 +1283,10 @@ def _info(self): other_info = [] oe = self._other_effect_cats.dataframe for c in oe: - name = 'Observations per group (' + str(c) + ')' - other_info.append(panel_structure_stats(oe[c].values.astype(np.int32), name)) + name = "Observations per group (" + str(c) + ")" + other_info.append( + panel_structure_stats(oe[c].values.astype(np.int32), name) + ) other_info = pd.DataFrame(other_info) return entity_info, time_info, other_info @@ -1185,13 +1306,13 @@ def _is_effect_nested(effects, clusters): return np.all(is_nested) def _determine_df_adjustment(self, cov_type, **cov_config): - if cov_type != 'clustered' or not self._has_effect: + if cov_type != "clustered" or not self._has_effect: return True num_effects = self.entity_effects + self.time_effects if self.other_effects: num_effects += self._other_effect_cats.shape[1] - clusters = cov_config.get('clusters', None) + clusters = cov_config.get("clusters", None) if clusters is None: # No clusters return True @@ -1200,8 +1321,18 @@ def _determine_df_adjustment(self, cov_type, **cov_config): return not self._is_effect_nested(effects, clusters) return True # Default case for 2-way -- not completely clear - def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, 
cov_type='unadjusted', - debiased=True, auto_df=True, count_effects=True, **cov_config): + def fit( + self, + *, + use_lsdv=False, + use_lsmr=False, + low_memory=None, + cov_type="unadjusted", + debiased=True, + auto_df=True, + count_effects=True, + **cov_config + ): """ Estimate model parameters @@ -1240,7 +1371,7 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad Returns ------- - results : PanelEffectsResults + PanelEffectsResults Estimation results Examples @@ -1281,13 +1412,17 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad elif use_lsdv: y, x, ybar, y_effects, x_effects = self._slow_path() else: - low_memory = self._choose_twoway_algo() if low_memory is None else low_memory + low_memory = ( + self._choose_twoway_algo() if low_memory is None else low_memory + ) if not weighted: y, x, ybar = self._fast_path(low_memory=low_memory) y_effects = 0.0 x_effects = np.zeros(x.shape[1]) else: - y, x, ybar, y_effects, x_effects = self._weighted_fast_path(low_memory=low_memory) + y, x, ybar, y_effects, x_effects = self._weighted_fast_path( + low_memory=low_memory + ) neffects = 0 drop_first = self.has_constant @@ -1310,10 +1445,13 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad retain = not_absorbed(x) if len(retain) != x.shape[1]: drop = set(range(x.shape[1])).difference(retain) - dropped = ', '.join([self.exog.vars[i] for i in drop]) + dropped = ", ".join([self.exog.vars[i] for i in drop]) import warnings - warnings.warn(absorbing_warn_msg.format(absorbed_variables=dropped), - AbsorbingEffectWarning) + + warnings.warn( + absorbing_warn_msg.format(absorbed_variables=dropped), + AbsorbingEffectWarning, + ) x = x[:, retain] # Adjust exog self.exog = PanelData(self.exog.dataframe.iloc[:, retain]) @@ -1329,8 +1467,16 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad count_effects = self._determine_df_adjustment(cov_type, **cov_config) extra_df = neffects if count_effects else 0 - cov = cov_est(y, x, params, self.dependent.entity_ids, self.dependent.time_ids, - debiased=debiased, extra_df=extra_df, **cov_config) + cov = cov_est( + y, + x, + params, + self.dependent.entity_ids, + self.dependent.time_ids, + debiased=debiased, + extra_df=extra_df, + **cov_config + ) weps = y - x @ params eps = weps _y = self.dependent.values2d @@ -1342,8 +1488,8 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad w = self.weights.values2d eps -= (w * eps).sum() / w.sum() index = self.dependent.index - fitted = pd.DataFrame(_x @ params, index, ['fitted_values']) - idiosyncratic = pd.DataFrame(eps, index, ['idiosyncratic']) + fitted = pd.DataFrame(_x @ params, index, ["fitted_values"]) + idiosyncratic = pd.DataFrame(eps, index, ["idiosyncratic"]) eps_effects = _y - fitted.values sigma2_tot = float(eps_effects.T @ eps_effects / nobs) @@ -1362,7 +1508,12 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, low_memory=None, cov_type='unad root_w = np.sqrt(self.weights.values2d) y_ex = root_w * self.dependent.values2d mu_ex = 0 - if self.has_constant or self.entity_effects or self.time_effects or self.other_effects: + if ( + self.has_constant + or self.entity_effects + or self.time_effects + or self.other_effects + ): mu_ex = root_w * ((root_w.T @ y_ex) / (root_w.T @ root_w)) total_ss_ex_effect = float((y_ex - mu_ex).T @ (y_ex - mu_ex)) r2_ex_effects = 1 - resid_ss / total_ss_ex_effect @@ -1385,22 +1536,48 @@ def fit(self, *, use_lsdv=False, use_lsmr=False, 
low_memory=None, cov_type='unad denom = resid_ss / df_denom stat = num / denom - f_pooled = WaldTestStatistic(stat, 'Effects are zero', - df_num, df_denom=df_denom, - name='Pooled F-statistic') + f_pooled = WaldTestStatistic( + stat, + "Effects are zero", + df_num, + df_denom=df_denom, + name="Pooled F-statistic", + ) res.update(f_pooled=f_pooled) - effects = pd.DataFrame(eps_effects - eps, columns=['estimated_effects'], - index=self.dependent.index) + effects = pd.DataFrame( + eps_effects - eps, + columns=["estimated_effects"], + index=self.dependent.index, + ) else: - effects = pd.DataFrame(np.zeros_like(eps), columns=['estimated_effects'], - index=self.dependent.index) - - res.update(dict(df_resid=df_resid, df_model=df_model, nobs=y.shape[0], - residual_ss=resid_ss, total_ss=total_ss, wresids=weps, resids=eps, - r2=r2, entity_effects=self.entity_effects, time_effects=self.time_effects, - other_effects=self.other_effects, sigma2_eps=sigma2_eps, - sigma2_effects=sigma2_effects, rho=rho, r2_ex_effects=r2_ex_effects, - effects=effects, fitted=fitted, idiosyncratic=idiosyncratic)) + effects = pd.DataFrame( + np.zeros_like(eps), + columns=["estimated_effects"], + index=self.dependent.index, + ) + + res.update( + dict( + df_resid=df_resid, + df_model=df_model, + nobs=y.shape[0], + residual_ss=resid_ss, + total_ss=total_ss, + wresids=weps, + resids=eps, + r2=r2, + entity_effects=self.entity_effects, + time_effects=self.time_effects, + other_effects=self.other_effects, + sigma2_eps=sigma2_eps, + sigma2_effects=sigma2_effects, + rho=rho, + r2_ex_effects=r2_ex_effects, + effects=effects, + fitted=fitted, + idiosyncratic=idiosyncratic, + ) + ) return PanelEffectsResults(res) @@ -1411,11 +1588,11 @@ class BetweenOLS(PooledOLS): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity) - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic.
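As a usage sketch only (nothing below is part of the patch; df, y and x1 are hypothetical names for a MultiIndex (entity, time) DataFrame and its columns), the estimators reformatted in this file are typically driven as follows, including the cluster_entity option handled by _choose_cov above:

import numpy as np
import pandas as pd
from linearmodels.panel import BetweenOLS, FamaMacBeth, PanelOLS

# Hypothetical balanced panel: 50 entities observed over 10 periods
rng = np.random.default_rng(0)
index = pd.MultiIndex.from_product([range(50), range(10)], names=["entity", "time"])
df = pd.DataFrame(
    {"y": rng.standard_normal(500), "x1": rng.standard_normal(500)}, index=index
)

# Two-way fixed effects with entity-clustered standard errors
fe = PanelOLS.from_formula("y ~ 1 + x1 + EntityEffects + TimeEffects", data=df)
print(fe.fit(cov_type="clustered", cluster_entity=True))

# Between estimator on entity means; Fama-MacBeth period-by-period regressions
print(BetweenOLS.from_formula("y ~ 1 + x1", data=df).fit())
print(FamaMacBeth.from_formula("y ~ 1 + x1", data=df).fit())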
@@ -1433,33 +1610,39 @@ class BetweenOLS(PooledOLS): def __init__(self, dependent, exog, *, weights=None): super(BetweenOLS, self).__init__(dependent, exog, weights=weights) - self._cov_estimators = CovarianceManager(self.__class__.__name__, HomoskedasticCovariance, - HeteroskedasticCovariance, ClusteredCovariance) + self._cov_estimators = CovarianceManager( + self.__class__.__name__, + HomoskedasticCovariance, + HeteroskedasticCovariance, + ClusteredCovariance, + ) def _choose_cov(self, cov_type, **cov_config): """Return covariance estimator reformat clusters""" cov_est = self._cov_estimators[cov_type] - if cov_type != 'clustered': + if cov_type != "clustered": return cov_est, cov_config cov_config_upd = {k: v for k, v in cov_config.items()} - clusters = cov_config.get('clusters', None) + clusters = cov_config.get("clusters", None) if clusters is not None: clusters = self.reformat_clusters(clusters).copy() cluster_max = np.nanmax(clusters.values3d, axis=1) delta = cluster_max - np.nanmin(clusters.values3d, axis=1) if np.any(delta != 0): - raise ValueError('clusters must not vary within an entity') + raise ValueError("clusters must not vary within an entity") index = clusters.panel.minor_axis reindex = clusters.entities clusters = pd.DataFrame(cluster_max.T, index=index, columns=clusters.vars) clusters = clusters.loc[reindex].astype(np.int64) - cov_config_upd['clusters'] = clusters + cov_config_upd["clusters"] = clusters return cov_est, cov_config_upd - def fit(self, *, reweight=False, cov_type='unadjusted', debiased=True, **cov_config): + def fit( + self, *, reweight=False, cov_type="unadjusted", debiased=True, **cov_config + ): """ Estimate model parameters @@ -1479,7 +1662,7 @@ def fit(self, *, reweight=False, cov_type='unadjusted', debiased=True, **cov_con Returns ------- - results : PanelResults + PanelResults Estimation results Examples @@ -1515,24 +1698,34 @@ def fit(self, *, reweight=False, cov_type='unadjusted', debiased=True, **cov_con params = lstsq(wx, wy)[0] df_resid = y.shape[0] - x.shape[1] - df_model = x.shape[1], + df_model = (x.shape[1],) nobs = y.shape[0] cov_est, cov_config = self._choose_cov(cov_type, **cov_config) - cov = cov_est(wy, wx, params, self.dependent.entity_ids, self.dependent.time_ids, - debiased=debiased, **cov_config) + cov = cov_est( + wy, + wx, + params, + self.dependent.entity_ids, + self.dependent.time_ids, + debiased=debiased, + **cov_config + ) weps = wy - wx @ params index = self.dependent.index - fitted = pd.DataFrame(self.exog.values2d @ params, index, ['fitted_values']) + fitted = pd.DataFrame(self.exog.values2d @ params, index, ["fitted_values"]) eps = y - x @ params - effects = pd.DataFrame(eps, self.dependent.entities, ['estimated_effects']) + effects = pd.DataFrame(eps, self.dependent.entities, ["estimated_effects"]) entities = fitted.index.levels[0][get_codes(fitted.index)[0]] effects = effects.loc[entities] effects.index = fitted.index dep = self.dependent.dataframe fitted = fitted.reindex(dep.index) effects = effects.reindex(dep.index) - idiosyncratic = pd.DataFrame(np.asarray(dep) - np.asarray(fitted) - np.asarray(effects), - dep.index, ['idiosyncratic']) + idiosyncratic = pd.DataFrame( + np.asarray(dep) - np.asarray(fitted) - np.asarray(effects), + dep.index, + ["idiosyncratic"], + ) residual_ss = float(weps.T @ weps) e = y @@ -1542,11 +1735,25 @@ def fit(self, *, reweight=False, cov_type='unadjusted', debiased=True, **cov_con total_ss = float(w.T @ (e ** 2)) r2 = 1 - residual_ss / total_ss - res = self._postestimation(params, cov, 
debiased, df_resid, weps, wy, wx, root_w) - res.update(dict(df_resid=df_resid, df_model=df_model, nobs=nobs, - residual_ss=residual_ss, total_ss=total_ss, r2=r2, wresids=weps, - resids=eps, index=self.dependent.entities, fitted=fitted, effects=effects, - idiosyncratic=idiosyncratic)) + res = self._postestimation( + params, cov, debiased, df_resid, weps, wy, wx, root_w + ) + res.update( + dict( + df_resid=df_resid, + df_model=df_model, + nobs=nobs, + residual_ss=residual_ss, + total_ss=total_ss, + r2=r2, + wresids=weps, + resids=eps, + index=self.dependent.entities, + fitted=fitted, + effects=effects, + idiosyncratic=idiosyncratic, + ) + ) return PanelResults(res) @@ -1559,18 +1766,18 @@ def from_formula(cls, formula, data, *, weights=None): ---------- formula : str Formula to transform into model. Conforms to patsy formula rules. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like, optional + weights: array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. Returns ------- - model : BetweenOLS + BetweenOLS Model specified using the formula Notes @@ -1597,11 +1804,11 @@ class FirstDifferenceOLS(PooledOLS): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity) - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. @@ -1618,51 +1825,60 @@ def __init__(self, dependent, exog, *, weights=None): super(FirstDifferenceOLS, self).__init__(dependent, exog, weights=weights) if self._constant: - raise ValueError('Constants are not allowed in first difference regressions.') + raise ValueError( + "Constants are not allowed in first difference regressions."
+ ) if self.dependent.nobs < 2: - raise ValueError('Panel must have at least 2 time periods') + raise ValueError("Panel must have at least 2 time periods") def _choose_cov(self, cov_type, **cov_config): """Return covariance estimator and reformat clusters""" cov_est = self._cov_estimators[cov_type] - if cov_type != 'clustered': + if cov_type != "clustered": return cov_est, cov_config cov_config_upd = {k: v for k, v in cov_config.items()} - clusters = cov_config.get('clusters', None) + clusters = cov_config.get("clusters", None) if clusters is not None: clusters = self.reformat_clusters(clusters).copy() fd = clusters.first_difference() fd = fd.values2d if np.any(fd.flat[np.isfinite(fd.flat)] != 0): - raise ValueError('clusters must be identical for values used ' - 'to compute the first difference') + raise ValueError( + "clusters must be identical for values used " + "to compute the first difference" + ) clusters = clusters.dataframe.copy() - cluster_entity = cov_config_upd.pop('cluster_entity', False) + cluster_entity = cov_config_upd.pop("cluster_entity", False) if cluster_entity: group_ids = self.dependent.entity_ids.squeeze() - name = 'cov.cluster.entity' - group_ids = pd.Series(group_ids, - index=self.dependent.index, - name=name) + name = "cov.cluster.entity" + group_ids = pd.Series(group_ids, index=self.dependent.index, name=name) if clusters is not None: clusters[name] = group_ids else: clusters = pd.DataFrame(group_ids) clusters = PanelData(clusters) values = clusters.values3d[:, 1:] - clusters = panel_to_frame(values, clusters.panel.items, clusters.panel.major_axis[1:], - clusters.panel.minor_axis, True) + clusters = panel_to_frame( + values, + clusters.panel.items, + clusters.panel.major_axis[1:], + clusters.panel.minor_axis, + True, + ) clusters = PanelData(clusters).dataframe clusters = clusters.loc[self.dependent.first_difference().index] clusters = clusters.astype(np.int64) - cov_config_upd['clusters'] = clusters.values if clusters is not None else clusters + cov_config_upd["clusters"] = ( + clusters.values if clusters is not None else clusters + ) return cov_est, cov_config_upd - def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): + def fit(self, *, cov_type="unadjusted", debiased=True, **cov_config): """ Estimate model parameters @@ -1678,7 +1894,7 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): Returns ------- - results : PanelResults + PanelResults Estimation results Examples @@ -1728,9 +1944,14 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): w = 1.0 / self.weights.values3d w = w[:, :-1] + w[:, 1:] w = 1.0 / w - w = panel_to_frame(w, self.weights.panel.items, self.weights.panel.major_axis[1:], - self.weights.panel.minor_axis, True) - w = w.reindex(self.weights.index).dropna(how='any') + w = panel_to_frame( + w, + self.weights.panel.items, + self.weights.panel.major_axis[1:], + self.weights.panel.minor_axis, + True, + ) + w = w.reindex(self.weights.index).dropna(how="any") index = w.index w = w.values @@ -1742,26 +1963,49 @@ def fit(self, *, cov_type='unadjusted', debiased=True, **cov_config): params = lstsq(wx, wy)[0] df_resid = y.shape[0] - x.shape[1] cov_est, cov_config = self._choose_cov(cov_type, **cov_config) - cov = cov_est(wy, wx, params, entity_ids, time_ids, debiased=debiased, **cov_config) + cov = cov_est( + wy, wx, params, entity_ids, time_ids, debiased=debiased, **cov_config + ) weps = wy - wx @ params - fitted = pd.DataFrame(self.exog.values2d @ params, - self.dependent.index, 
['fitted_values']) - idiosyncratic = pd.DataFrame(self.dependent.values2d - fitted.values, - self.dependent.index, ['idiosyncratic']) - effects = pd.DataFrame(np.full_like(fitted.values, np.nan), self.dependent.index, - ['estimated_effects']) + fitted = pd.DataFrame( + self.exog.values2d @ params, self.dependent.index, ["fitted_values"] + ) + idiosyncratic = pd.DataFrame( + self.dependent.values2d - fitted.values, + self.dependent.index, + ["idiosyncratic"], + ) + effects = pd.DataFrame( + np.full_like(fitted.values, np.nan), + self.dependent.index, + ["estimated_effects"], + ) eps = y - x @ params residual_ss = float(weps.T @ weps) total_ss = float(w.T @ (y ** 2)) r2 = 1 - residual_ss / total_ss - res = self._postestimation(params, cov, debiased, df_resid, weps, wy, wx, root_w) - res.update(dict(df_resid=df_resid, df_model=x.shape[1], nobs=y.shape[0], - residual_ss=residual_ss, total_ss=total_ss, r2=r2, - resids=eps, wresids=weps, index=index, fitted=fitted, effects=effects, - idiosyncratic=idiosyncratic)) + res = self._postestimation( + params, cov, debiased, df_resid, weps, wy, wx, root_w + ) + res.update( + dict( + df_resid=df_resid, + df_model=x.shape[1], + nobs=y.shape[0], + residual_ss=residual_ss, + total_ss=total_ss, + r2=r2, + resids=eps, + wresids=weps, + index=index, + fitted=fitted, + effects=effects, + idiosyncratic=idiosyncratic, + ) + ) return PanelResults(res) @@ -1774,18 +2018,18 @@ def from_formula(cls, formula, data, *, weights=None): ---------- formula : str Formula to transform into model. Conforms to patsy formula rules. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like, optional + weights: array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. Returns ------- - model : FirstDifferenceOLS + FirstDifferenceOLS Model specified using the formula Notes @@ -1812,11 +2056,11 @@ class RandomEffects(PooledOLS): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity) - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. @@ -1842,18 +2086,18 @@ def from_formula(cls, formula, data, *, weights=None): ---------- formula : str Formula to transform into model. Conforms to patsy formula rules. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like, optional + weights: array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic.
Returns ------- - model : RandomEffects + RandomEffects Model specified using the formula Notes @@ -1873,11 +2117,13 @@ def from_formula(cls, formula, data, *, weights=None): mod.formula = formula return mod - def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov_config): + def fit( + self, *, small_sample=False, cov_type="unadjusted", debiased=True, **cov_config + ): w = self.weights.values2d root_w = np.sqrt(w) - y = self.dependent.demean('entity', weights=self.weights).values2d - x = self.exog.demean('entity', weights=self.weights).values2d + y = self.dependent.demean("entity", weights=self.weights).values2d + x = self.exog.demean("entity", weights=self.weights).values2d if self.has_constant: w_sum = w.sum() y_gm = (w * self.dependent.values2d).sum(0) / w_sum @@ -1887,8 +2133,8 @@ def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov params = lstsq(x, y)[0] weps = y - x @ params - wybar = self.dependent.mean('entity', weights=self.weights) - wxbar = self.exog.mean('entity', weights=self.weights) + wybar = self.dependent.mean("entity", weights=self.weights) + wxbar = self.exog.mean("entity", weights=self.weights) params = lstsq(wxbar, wybar)[0] wu = np.asarray(wybar) - np.asarray(wxbar) @ params @@ -1897,12 +2143,12 @@ def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov nvar = x.shape[1] sigma2_e = float(weps.T @ weps) / (nobs - nvar - neffects + 1) ssr = float(wu.T @ wu) - t = self.dependent.count('entity').values + t = self.dependent.count("entity").values unbalanced = np.ptp(t) != 0 if small_sample and unbalanced: ssr = float((t * wu).T @ wu) wx = root_w * self.exog.dataframe - means = wx.groupby(level=0).transform('mean').values + means = wx.groupby(level=0).transform("mean").values denom = means.T @ means sums = wx.groupby(level=0).sum().values num = sums.T @ sums @@ -1914,7 +2160,7 @@ def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov rho = sigma2_u / (sigma2_u + sigma2_e) theta = 1 - np.sqrt(sigma2_e / (t * sigma2_u + sigma2_e)) - theta_out = pd.DataFrame(theta, columns=['theta'], index=wybar.index) + theta_out = pd.DataFrame(theta, columns=["theta"], index=wybar.index) wy = root_w * self.dependent.values2d wx = root_w * self.exog.values2d index = self.dependent.index @@ -1927,16 +2173,26 @@ def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov df_resid = wy.shape[0] - wx.shape[1] cov_est, cov_config = self._choose_cov(cov_type, **cov_config) - cov = cov_est(wy, wx, params, self.dependent.entity_ids, self.dependent.time_ids, - debiased=debiased, **cov_config) + cov = cov_est( + wy, + wx, + params, + self.dependent.entity_ids, + self.dependent.time_ids, + debiased=debiased, + **cov_config + ) weps = wy - wx @ params eps = weps / root_w index = self.dependent.index - fitted = pd.DataFrame(self.exog.values2d @ params, index, ['fitted_values']) - effects = pd.DataFrame(self.dependent.values2d - np.asarray(fitted) - eps, index, - ['estimated_effects']) - idiosyncratic = pd.DataFrame(eps, index, ['idiosyncratic']) + fitted = pd.DataFrame(self.exog.values2d @ params, index, ["fitted_values"]) + effects = pd.DataFrame( + self.dependent.values2d - np.asarray(fitted) - eps, + index, + ["estimated_effects"], + ) + idiosyncratic = pd.DataFrame(eps, index, ["idiosyncratic"]) residual_ss = float(weps.T @ weps) wmu = 0 if self.has_constant: @@ -1945,12 +2201,29 @@ def fit(self, *, small_sample=False, cov_type='unadjusted', debiased=True, **cov total_ss = 
float(wy_demeaned.T @ wy_demeaned) r2 = 1 - residual_ss / total_ss - res = self._postestimation(params, cov, debiased, df_resid, weps, wy, wx, root_w) - res.update(dict(df_resid=df_resid, df_model=x.shape[1], nobs=y.shape[0], - residual_ss=residual_ss, total_ss=total_ss, r2=r2, - resids=eps, wresids=weps, index=index, sigma2_eps=sigma2_e, - sigma2_effects=sigma2_u, rho=rho, theta=theta_out, - fitted=fitted, effects=effects, idiosyncratic=idiosyncratic)) + res = self._postestimation( + params, cov, debiased, df_resid, weps, wy, wx, root_w + ) + res.update( + dict( + df_resid=df_resid, + df_model=x.shape[1], + nobs=y.shape[0], + residual_ss=residual_ss, + total_ss=total_ss, + r2=r2, + resids=eps, + wresids=weps, + index=index, + sigma2_eps=sigma2_e, + sigma2_effects=sigma2_u, + rho=rho, + theta=theta_out, + fitted=fitted, + effects=effects, + idiosyncratic=idiosyncratic, + ) + ) return RandomEffectsResults(res) @@ -1961,11 +2234,11 @@ class FamaMacBeth(PooledOLS): Parameters ---------- - dependent : array-like + dependent : array_like Dependent (left-hand-side) variable (time by entity) - exog : array-like + exog : array_like Exogenous or right-hand-side variables (variable by time by entity). - weights : array-like, optional + weights : array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. @@ -2006,30 +2279,41 @@ def _validate_blocks(self): wx = root_w * x exog = self.exog.dataframe - wx = pd.DataFrame(wx[self._not_null], index=exog.notnull().index, columns=exog.columns) + wx = pd.DataFrame( + wx[self._not_null], index=exog.notnull().index, columns=exog.columns + ) def validate_block(ex): return ex.shape[0] >= ex.shape[1] and matrix_rank(ex) == ex.shape[1] valid_blocks = wx.groupby(level=1).apply(validate_block) if not valid_blocks.any(): - err = 'Model cannot be estimated. All blocks of time-series observations are rank\n' \ - 'deficient, and so it is not possible to estimate any cross-sectional ' \ - 'regressions.' + err = ( + "Model cannot be estimated. All blocks of time-series observations are rank\n" + "deficient, and so it is not possible to estimate any cross-sectional " + "regressions." + ) raise ValueError(err) if valid_blocks.sum() < exog.shape[1]: import warnings - warnings.warn('The number of time-series observation available to estimate ' - 'cross-sectional\nregressions, {0}, is less than the number of ' - 'parameters in the model. Parameter\ninference is not ' - 'available.'.format(valid_blocks.sum()), InferenceUnavailableWarning) + + warnings.warn( + "The number of time-series observations available to estimate " + "cross-sectional\nregressions, {0}, is less than the number of " + "parameters in the model. 
Parameter\ninference is not " + "available.".format(valid_blocks.sum()), + InferenceUnavailableWarning, + ) elif valid_blocks.sum() < valid_blocks.shape[0]: import warnings - warnings.warn('{0} of the time-series regressions cannot be estimated due to ' - 'deficient rank.'.format(valid_blocks.shape[0] - valid_blocks.sum()), - MissingValueWarning) - def fit(self, cov_type='unadjusted', debiased=True, **cov_config): + warnings.warn( + "{0} of the time-series regressions cannot be estimated due to " + "deficient rank.".format(valid_blocks.shape[0] - valid_blocks.sum()), + MissingValueWarning, + ) + + def fit(self, cov_type="unadjusted", debiased=True, **cov_config): """ Estimate model parameters @@ -2045,7 +2329,7 @@ def fit(self, cov_type='unadjusted', debiased=True, **cov_config): Returns ------- - results : PanelResults + PanelResults Estimation results Examples @@ -2078,10 +2362,15 @@ def fit(self, cov_type='unadjusted', debiased=True, **cov_config): exog = self.exog.dataframe index = self.dependent.index wy = pd.DataFrame(wy[self._not_null], index=index, columns=dep.columns) - wx = pd.DataFrame(wx[self._not_null], index=exog.notnull().index, columns=exog.columns) + wx = pd.DataFrame( + wx[self._not_null], index=exog.notnull().index, columns=exog.columns + ) - yx = pd.DataFrame(np.c_[wy.values, wx.values], columns=list(wy.columns) + list(wx.columns), - index=wy.index) + yx = pd.DataFrame( + np.c_[wy.values, wx.values], + columns=list(wy.columns) + list(wx.columns), + index=wy.index, + ) def single(z: pd.DataFrame): exog = z.iloc[:, 1:].values @@ -2099,10 +2388,13 @@ def single(z: pd.DataFrame): wy = wy.values wx = wx.values index = self.dependent.index - fitted = pd.DataFrame(self.exog.values2d @ params, index, ['fitted_values']) - effects = pd.DataFrame(np.full_like(fitted.values, np.nan), index, ['estimated_effects']) - idiosyncratic = pd.DataFrame(self.dependent.values2d - fitted.values, index, - ['idiosyncratic']) + fitted = pd.DataFrame(self.exog.values2d @ params, index, ["fitted_values"]) + effects = pd.DataFrame( + np.full_like(fitted.values, np.nan), index, ["estimated_effects"] + ) + idiosyncratic = pd.DataFrame( + self.dependent.values2d - fitted.values, index, ["idiosyncratic"] + ) eps = self.dependent.values2d - fitted.values weps = wy - wx @ params @@ -2116,21 +2408,35 @@ def single(z: pd.DataFrame): total_ss = float(w.T @ (e ** 2)) r2 = 1 - residual_ss / total_ss - if cov_type in ('robust', 'unadjusted', 'homoskedastic', 'heteroskedastic'): + if cov_type in ("robust", "unadjusted", "homoskedastic", "heteroskedastic"): cov_est = FamaMacBethCovariance - elif cov_type == 'kernel': + elif cov_type == "kernel": cov_est = FamaMacBethKernelCovariance else: - raise ValueError('Unknown cov_type') + raise ValueError("Unknown cov_type") cov = cov_est(wy, wx, params, all_params, debiased=debiased, **cov_config) df_resid = wy.shape[0] - params.shape[0] - res = self._postestimation(params, cov, debiased, df_resid, weps, wy, wx, root_w) + res = self._postestimation( + params, cov, debiased, df_resid, weps, wy, wx, root_w + ) index = self.dependent.index - res.update(dict(df_resid=df_resid, df_model=x.shape[1], nobs=y.shape[0], - residual_ss=residual_ss, total_ss=total_ss, - r2=r2, resids=eps, wresids=weps, index=index, fitted=fitted, - effects=effects, idiosyncratic=idiosyncratic)) + res.update( + dict( + df_resid=df_resid, + df_model=x.shape[1], + nobs=y.shape[0], + residual_ss=residual_ss, + total_ss=total_ss, + r2=r2, + resids=eps, + wresids=weps, + index=index, + fitted=fitted, + 
effects=effects, + idiosyncratic=idiosyncratic, + ) + ) return PanelResults(res) @classmethod @@ -2142,18 +2448,18 @@ def from_formula(cls, formula, data, *, weights=None): ---------- formula : str Formula to transform into model. Conforms to patsy formula rules. - data : array-like + data : array_like Data structure that can be coerced into a PanelData. In most cases, this should be a multi-index DataFrame where the level 0 index contains the entities and the level 1 contains the time. - weights: array-like, optional + weights: array_like, optional Weights to use in estimation. Assumes residual variance is proportional to inverse of weight so that the residual times the weight should be homoskedastic. Returns ------- - model : FamaMacBeth + FamaMacBeth Model specified using the formula Notes diff --git a/linearmodels/panel/results.py b/linearmodels/panel/results.py index 9c35633d22..ddf70e039b 100644 --- a/linearmodels/panel/results.py +++ b/linearmodels/panel/results.py @@ -2,9 +2,9 @@ import datetime as dt -from property_cached import cached_property import numpy as np from pandas import DataFrame, Series, concat +from property_cached import cached_property from scipy import stats from statsmodels.iolib.summary import SimpleTable, fmt_2cols, fmt_params @@ -12,7 +12,7 @@ from linearmodels.utility import (_ModelComparison, _str, _SummaryStr, pval_format, quadratic_form_test) -__all__ = ['PanelResults', 'PanelEffectsResults', 'RandomEffectsResults'] +__all__ = ["PanelResults", "PanelEffectsResults", "RandomEffectsResults"] class PanelResults(_SummaryStr): @@ -56,24 +56,24 @@ def __init__(self, res): @property def params(self): """Estimated parameters""" - return Series(self._params, index=self._var_names, name='parameter') + return Series(self._params, index=self._var_names, name="parameter") @cached_property def cov(self): """Estimated covariance of parameters""" - return DataFrame(self._deferred_cov(), - columns=self._var_names, - index=self._var_names) + return DataFrame( + self._deferred_cov(), columns=self._var_names, index=self._var_names + ) @property def std_errors(self): """Estimated parameter standard errors""" - return Series(np.sqrt(np.diag(self.cov)), self._var_names, name='std_error') + return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error") @property def tstats(self): """Parameter t-statistics""" - return Series(self._params / self.std_errors, name='tstat') + return Series(self._params / self.std_errors, name="tstat") @cached_property def pvalues(self): @@ -85,7 +85,7 @@ def pvalues(self): pv = 2 * (1 - stats.t.cdf(abs_tstats, self.df_resid)) else: pv = 2 * (1 - stats.norm.cdf(abs_tstats)) - return Series(pv, index=self._var_names, name='pvalue') + return Series(pv, index=self._var_names, name="pvalue") @property def df_resid(self): @@ -145,7 +145,7 @@ def rsquared_between(self): Returns ------- - rsquared : float + float Between coefficient of determination Notes @@ -161,7 +161,7 @@ def rsquared_within(self): Returns ------- - rsquared : float + float Within coefficient of determination Notes @@ -177,7 +177,7 @@ def rsquared_overall(self): Returns ------- - rsquared : float + float Overall coefficient of determination Notes @@ -214,7 +214,7 @@ def conf_int(self, level=0.95): Returns ------- - ci : DataFrame + DataFrame Confidence interval of the form [lower, upper] for each parameter Notes @@ -228,7 +228,7 @@ def conf_int(self, level=0.95): q = stats.norm.ppf(ci_quantiles) q = q[None, :] ci = self.params[:, None] + self.std_errors[:, None] * q - return 
DataFrame(ci, index=self._var_names, columns=['lower', 'upper']) + return DataFrame(ci, index=self._var_names, columns=["lower", "upper"]) @property def summary(self): @@ -238,54 +238,59 @@ def summary(self): ``summary.as_html()`` and ``summary.as_latex()``. """ - title = self.name + ' Estimation Summary' + title = self.name + " Estimation Summary" mod = self.model - top_left = [('Dep. Variable:', mod.dependent.vars[0]), - ('Estimator:', self.name), - ('No. Observations:', self.nobs), - ('Date:', self._datetime.strftime('%a, %b %d %Y')), - ('Time:', self._datetime.strftime('%H:%M:%S')), - ('Cov. Estimator:', self._cov_type), - ('', ''), - ('Entities:', str(int(self.entity_info['total']))), - ('Avg Obs:', _str(self.entity_info['mean'])), - ('Min Obs:', _str(self.entity_info['min'])), - ('Max Obs:', _str(self.entity_info['max'])), - ('', ''), - ('Time periods:', str(int(self.time_info['total']))), - ('Avg Obs:', _str(self.time_info['mean'])), - ('Min Obs:', _str(self.time_info['min'])), - ('Max Obs:', _str(self.time_info['max'])), - ('', '')] + top_left = [ + ("Dep. Variable:", mod.dependent.vars[0]), + ("Estimator:", self.name), + ("No. Observations:", self.nobs), + ("Date:", self._datetime.strftime("%a, %b %d %Y")), + ("Time:", self._datetime.strftime("%H:%M:%S")), + ("Cov. Estimator:", self._cov_type), + ("", ""), + ("Entities:", str(int(self.entity_info["total"]))), + ("Avg Obs:", _str(self.entity_info["mean"])), + ("Min Obs:", _str(self.entity_info["min"])), + ("Max Obs:", _str(self.entity_info["max"])), + ("", ""), + ("Time periods:", str(int(self.time_info["total"]))), + ("Avg Obs:", _str(self.time_info["mean"])), + ("Min Obs:", _str(self.time_info["min"])), + ("Max Obs:", _str(self.time_info["max"])), + ("", ""), + ] is_invalid = np.isfinite(self.f_statistic.stat) - f_stat = _str(self.f_statistic.stat) if is_invalid else '--' - f_pval = pval_format(self.f_statistic.pval) if is_invalid else '--' - f_dist = self.f_statistic.dist_name if is_invalid else '--' - - f_robust = _str(self.f_statistic_robust.stat) if is_invalid else '--' - f_robust_pval = pval_format(self.f_statistic_robust.pval) if is_invalid else '--' - f_robust_name = self.f_statistic_robust.dist_name if is_invalid else '--' - - top_right = [('R-squared:', _str(self.rsquared)), - ('R-squared (Between):', _str(self.rsquared_between)), - ('R-squared (Within):', _str(self.rsquared_within)), - ('R-squared (Overall):', _str(self.rsquared_overall)), - ('Log-likelihood', _str(self._loglik)), - ('', ''), - ('F-statistic:', f_stat), - ('P-value', f_pval), - ('Distribution:', f_dist), - ('', ''), - ('F-statistic (robust):', f_robust), - ('P-value', f_robust_pval), - ('Distribution:', f_robust_name), - ('', ''), - ('', ''), - ('', ''), - ('', ''), - ] + f_stat = _str(self.f_statistic.stat) if is_invalid else "--" + f_pval = pval_format(self.f_statistic.pval) if is_invalid else "--" + f_dist = self.f_statistic.dist_name if is_invalid else "--" + + f_robust = _str(self.f_statistic_robust.stat) if is_invalid else "--" + f_robust_pval = ( + pval_format(self.f_statistic_robust.pval) if is_invalid else "--" + ) + f_robust_name = self.f_statistic_robust.dist_name if is_invalid else "--" + + top_right = [ + ("R-squared:", _str(self.rsquared)), + ("R-squared (Between):", _str(self.rsquared_between)), + ("R-squared (Within):", _str(self.rsquared_within)), + ("R-squared (Overall):", _str(self.rsquared_overall)), + ("Log-likelihood", _str(self._loglik)), + ("", ""), + ("F-statistic:", f_stat), + ("P-value", f_pval), + ("Distribution:", f_dist), + 
("", ""), + ("F-statistic (robust):", f_robust), + ("P-value", f_robust_pval), + ("Distribution:", f_robust_name), + ("", ""), + ("", ""), + ("", ""), + ("", ""), + ] stubs = [] vals = [] @@ -299,9 +304,9 @@ def summary(self): # Top Table # Parameter table fmt = fmt_2cols - fmt['data_fmts'][1] = '%18s' + fmt["data_fmts"][1] = "%18s" - top_right = [('%-21s' % (' ' + k), v) for k, v in top_right] + top_right = [("%-21s" % (" " + k), v) for k, v in top_right] stubs = [] vals = [] for stub, val in top_right: @@ -310,11 +315,13 @@ def summary(self): table.extend_right(SimpleTable(vals, stubs=stubs)) smry.tables.append(table) - param_data = np.c_[self.params.values[:, None], - self.std_errors.values[:, None], - self.tstats.values[:, None], - self.pvalues.values[:, None], - self.conf_int()] + param_data = np.c_[ + self.params.values[:, None], + self.std_errors.values[:, None], + self.tstats.values[:, None], + self.pvalues.values[:, None], + self.conf_int(), + ] data = [] for row in param_data: txt_row = [] @@ -324,14 +331,12 @@ def summary(self): f = pval_format txt_row.append(f(v)) data.append(txt_row) - title = 'Parameter Estimates' + title = "Parameter Estimates" table_stubs = list(self.params.index) - header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value', 'Lower CI', 'Upper CI'] - table = SimpleTable(data, - stubs=table_stubs, - txt_fmt=fmt_params, - headers=header, - title=title) + header = ["Parameter", "Std. Err.", "T-stat", "P-value", "Lower CI", "Upper CI"] + table = SimpleTable( + data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title + ) smry.tables.append(table) return smry @@ -345,26 +350,36 @@ def resids(self): These residuals are from the estimated model. They will not have the same shape as the original data whenever the model is estimated on transformed data which has a different shape.""" - return Series(self._resids.squeeze(), index=self._index, name='residual') + return Series(self._resids.squeeze(), index=self._index, name="residual") def _out_of_sample(self, exog, data, fitted, missing): """Interface between model predict and predict for OOS fits""" if exog is not None and data is not None: - raise ValueError('Predictions can only be constructed using one ' - 'of exog or data, but not both.') + raise ValueError( + "Predictions can only be constructed using one " + "of exog or data, but not both." 
+ ) pred = self.model.predict(self.params, exog=exog, data=data) if not missing: pred = pred.loc[pred.notnull().all(1)] return pred - def predict(self, exog=None, *, data=None, fitted=True, effects=False, idiosyncratic=False, - missing=False): + def predict( + self, + exog=None, + *, + data=None, + fitted=True, + effects=False, + idiosyncratic=False, + missing=False + ): """ In- and out-of-sample predictions Parameters ---------- - exog : array-like + exog : array_like Exogenous values to use in out-of-sample prediction (nobs by nexog) data : DataFrame, optional DataFrame to use for out-of-sample predictions when model was @@ -382,7 +397,7 @@ def predict(self, exog=None, *, data=None, fitted=True, effects=False, idiosyncr Returns ------- - predictions : DataFrame + DataFrame DataFrame containing columns for all selected output Notes @@ -407,7 +422,7 @@ def predict(self, exog=None, *, data=None, fitted=True, effects=False, idiosyncr if idiosyncratic: out.append(self.idiosyncratic) if len(out) == 0: - raise ValueError('At least one output must be selected') + raise ValueError("At least one output must be selected") out = concat(out, 1) # type: DataFrame if missing: index = self._original_index @@ -448,7 +463,9 @@ def idiosyncratic(self): @property def wresids(self): """Weighted model residuals""" - return Series(self._wresids.squeeze(), index=self._index, name='weighted residual') + return Series( + self._wresids.squeeze(), index=self._index, name="weighted residual" + ) @property def f_statistic_robust(self): @@ -457,7 +474,7 @@ def f_statistic_robust(self): Returns ------- - f_stat : WaldTestStatistic + WaldTestStatistic Statistic value, distribution and p-value Notes @@ -489,7 +506,7 @@ def f_statistic(self): Returns ------- - f_stat : WaldTestStatistic + WaldTestStatistic Statistic value, distribution and p-value Notes @@ -540,7 +557,7 @@ def wald_test(self, restriction=None, value=None, *, formula=None): Returns ------- - t: WaldTestStatistic + WaldTestStatistic Test statistic for null that restrictions are valid. 
Notes @@ -579,8 +596,9 @@ def wald_test(self, restriction=None, value=None, *, formula=None): >>> formula = 'union = married = 0' >>> fe_res.wald_test(formula=formula) """ - return quadratic_form_test(self.params, self.cov, restriction=restriction, - value=value, formula=formula) + return quadratic_form_test( + self.params, self.cov, restriction=restriction, value=value, formula=formula + ) class PanelEffectsResults(PanelResults): @@ -608,7 +626,7 @@ def f_pooled(self): Returns ------- - f_pooled : WaldTestStatistic + WaldTestStatistic Statistic value, distribution and p-value Notes @@ -641,13 +659,13 @@ def included_effects(self): if entity_effect or time_effect or other_effect: effects = [] if entity_effect: - effects.append('Entity') + effects.append("Entity") if time_effect: - effects.append('Time') + effects.append("Time") if other_effect: oe = self.model._other_effect_cats.dataframe for c in oe: - effects.append('Other Effect (' + str(c) + ')') + effects.append("Other Effect (" + str(c) + ")") else: effects = [] return effects @@ -673,30 +691,34 @@ def summary(self): smry = super(PanelEffectsResults, self).summary is_invalid = np.isfinite(self.f_pooled.stat) - f_pool = _str(self.f_pooled.stat) if is_invalid else '--' - f_pool_pval = pval_format(self.f_pooled.pval) if is_invalid else '--' - f_pool_name = self.f_pooled.dist_name if is_invalid else '--' + f_pool = _str(self.f_pooled.stat) if is_invalid else "--" + f_pool_pval = pval_format(self.f_pooled.pval) if is_invalid else "--" + f_pool_name = self.f_pooled.dist_name if is_invalid else "--" extra_text = [] if is_invalid: - extra_text.append('F-test for Poolability: {0}'.format(f_pool)) - extra_text.append('P-value: {0}'.format(f_pool_pval)) - extra_text.append('Distribution: {0}'.format(f_pool_name)) - extra_text.append('') + extra_text.append("F-test for Poolability: {0}".format(f_pool)) + extra_text.append("P-value: {0}".format(f_pool_pval)) + extra_text.append("Distribution: {0}".format(f_pool_name)) + extra_text.append("") if self.included_effects: - effects = ', '.join(self.included_effects) - extra_text.append('Included effects: ' + effects) + effects = ", ".join(self.included_effects) + extra_text.append("Included effects: " + effects) if self.other_info is not None: ncol = self.other_info.shape[1] - extra_text.append('Model includes {0} other effects'.format(ncol)) + extra_text.append("Model includes {0} other effects".format(ncol)) for c in self.other_info.T: col = self.other_info.T[c] - extra_text.append('Other Effect {0}:'.format(c)) - stats = 'Avg Obs: {0}, Min Obs: {1}, Max Obs: {2}, Groups: {3}' - stats = stats.format(_str(col['mean']), _str(col['min']), _str(col['max']), - int(col['total'])) + extra_text.append("Other Effect {0}:".format(c)) + stats = "Avg Obs: {0}, Min Obs: {1}, Max Obs: {2}, Groups: {3}" + stats = stats.format( + _str(col["mean"]), + _str(col["min"]), + _str(col["max"]), + int(col["total"]), + ) extra_text.append(stats) smry.add_extra_txt(extra_text) @@ -707,8 +729,8 @@ def summary(self): def variance_decomposition(self): """Decomposition of total variance into effects and residuals""" vals = [self._sigma2_effects, self._sigma2_eps, self._rho] - index = ['Effects', 'Residual', 'Percent due to Effects'] - return Series(vals, index=index, name='Variance Decomposition') + index = ["Effects", "Residual", "Percent due to Effects"] + return Series(vals, index=index, name="Variance Decomposition") class RandomEffectsResults(PanelResults): @@ -727,8 +749,8 @@ def __init__(self, res): def 
variance_decomposition(self): """Decomposition of total variance into effects and residuals""" vals = [self._sigma2_effects, self._sigma2_eps, self._rho] - index = ['Effects', 'Residual', 'Percent due to Effects'] - return Series(vals, index=index, name='Variance Decomposition') + index = ["Effects", "Residual", "Percent due to Effects"] + return Series(vals, index=index, name="Variance Decomposition") @property def theta(self): @@ -736,7 +758,7 @@ def theta(self): return self._theta -def compare(results, precision='tstats'): +def compare(results, precision="tstats"): """ Compare the results of multiple models @@ -751,7 +773,8 @@ def compare(results, precision='tstats'): Returns ------- - comparison : PanelModelComparison + PanelModelComparison + The model comparison object. """ return PanelModelComparison(results, precision=precision) @@ -769,35 +792,36 @@ class PanelModelComparison(_ModelComparison): Estimator precision estimator to include in the comparison output. Default is 'tstats'. """ + _supported = (PanelEffectsResults, PanelResults, RandomEffectsResults) - def __init__(self, results, *, precision='tstats'): + def __init__(self, results, *, precision="tstats"): super(PanelModelComparison, self).__init__(results, precision=precision) @property def rsquared_between(self): """Coefficients of determination (R**2)""" - return self._get_property('rsquared_between') + return self._get_property("rsquared_between") @property def rsquared_within(self): """Coefficients of determination (R**2)""" - return self._get_property('rsquared_within') + return self._get_property("rsquared_within") @property def rsquared_overall(self): """Coefficients of determination (R**2)""" - return self._get_property('rsquared_overall') + return self._get_property("rsquared_overall") @property def estimator_method(self): """Estimation methods""" - return self._get_property('name') + return self._get_property("name") @property def cov_estimator(self): """Covariance estimator descriptions""" - return self._get_property('_cov_type') + return self._get_property("_cov_type") @property def summary(self): @@ -809,18 +833,38 @@ def summary(self): smry = Summary() models = list(self._results.keys()) - title = 'Model Comparison' - stubs = ['Dep. Variable', 'Estimator', 'No. Observations', 'Cov. Est.', 'R-squared', - 'R-Squared (Within)', 'R-Squared (Between)', 'R-Squared (Overall)', - 'F-statistic', 'P-value (F-stat)'] + title = "Model Comparison" + stubs = [ + "Dep. Variable", + "Estimator", + "No. Observations", + "Cov. 
Est.", + "R-squared", + "R-Squared (Within)", + "R-Squared (Between)", + "R-Squared (Overall)", + "F-statistic", + "P-value (F-stat)", + ] dep_name = {} for key in self._results: dep_name[key] = self._results[key].model.dependent.vars[0] dep_name = Series(dep_name) - vals = concat([dep_name, self.estimator_method, self.nobs, self.cov_estimator, - self.rsquared, self.rsquared_within, self.rsquared_between, - self.rsquared_overall, self.f_statistic], 1) + vals = concat( + [ + dep_name, + self.estimator_method, + self.nobs, + self.cov_estimator, + self.rsquared, + self.rsquared_within, + self.rsquared_between, + self.rsquared_overall, + self.f_statistic, + ], + 1, + ) vals = [[i for i in v] for v in vals.T.values] vals[2] = [str(v) for v in vals[2]] for i in range(4, len(vals)): @@ -838,11 +882,11 @@ def summary(self): precision_fmt = [] for v in precision.values[i]: v_str = _str(v) - v_str = '({0})'.format(v_str) if v_str.strip() else v_str + v_str = "({0})".format(v_str) if v_str.strip() else v_str precision_fmt.append(v_str) params_fmt.append(precision_fmt) params_stub.append(params.index[i]) - params_stub.append(' ') + params_stub.append(" ") vals = table_concat((vals, params_fmt)) stubs = stub_concat((stubs, params_stub)) @@ -850,32 +894,34 @@ def summary(self): all_effects = [] for key in self._results: res = self._results[key] - effects = getattr(res, 'included_effects', []) + effects = getattr(res, "included_effects", []) all_effects.append(effects) neffect = max(map(len, all_effects)) effects = [] - effects_stub = ['Effects'] + effects_stub = ["Effects"] for i in range(neffect): if i > 0: - effects_stub.append('') + effects_stub.append("") row = [] for j in range(len(self._results)): effect = all_effects[j] if len(effect) > i: row.append(effect[i]) else: - row.append('') + row.append("") effects.append(row) if effects: vals = table_concat((vals, effects)) stubs = stub_concat((stubs, effects_stub)) txt_fmt = default_txt_fmt.copy() - txt_fmt['data_aligns'] = 'r' - txt_fmt['header_align'] = 'r' - table = SimpleTable(vals, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt) + txt_fmt["data_aligns"] = "r" + txt_fmt["header_align"] = "r" + table = SimpleTable( + vals, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt + ) smry.tables.append(table) prec_type = self._PRECISION_TYPES[self._precision] - smry.add_extra_txt(['{0} reported in parentheses'.format(prec_type)]) + smry.add_extra_txt(["{0} reported in parentheses".format(prec_type)]) return smry diff --git a/linearmodels/panel/utility.py b/linearmodels/panel/utility.py index aef4621dd1..19cdeacbf0 100644 --- a/linearmodels/panel/utility.py +++ b/linearmodels/panel/utility.py @@ -46,7 +46,7 @@ def preconditioner(d, *, copy=False): """ Parameters ---------- - d : array-like + d : array_like Array to precondition copy : bool Flag indicating whether the operation should be in-place, if possible. @@ -54,12 +54,12 @@ def preconditioner(d, *, copy=False): Returns ------- - d : array-like - Array with same type as input array. If copy is False, and d is - an ndarray or a csc_matrix, then the operation is inplace - cond : ndarray - Array of conditioning numbers defined as the square root of the column - 2-norms (nvar,) + d : array_like + Array with same type as input array. 
If copy is False, and d is + an ndarray or a csc_matrix, then the operation is inplace + cond : ndarray + Array of conditioning numbers defined as the square root of the column + 2-norms (nvar,) """ # Dense path if not sp.issparse(d): @@ -92,7 +92,9 @@ def preconditioner(d, *, copy=False): return d, cond -def dummy_matrix(cats, *, format='csc', drop='first', drop_all=False, precondition=True): +def dummy_matrix( + cats, *, format="csc", drop="first", drop_all=False, precondition=True +): """ Parameters ---------- @@ -119,7 +121,7 @@ def dummy_matrix(cats, *, format='csc', drop='first', drop_all=False, preconditi Returns ------- - dummies : array-like + dummies : array_like Array, either sparse or dense, of size nobs x ncats containing the dummy variable values cond : ndarray @@ -137,37 +139,43 @@ def dummy_matrix(cats, *, format='csc', drop='first', drop_all=False, preconditi rows = np.arange(nobs) ucats, inverse = np.unique(codes[:, i], return_inverse=True) ncategories = len(ucats) - bits = min([i for i in (8, 16, 32, 64) if i - 1 > np.log2(ncategories + total_dummies)]) - replacements = np.arange(ncategories, dtype='int{:d}'.format(bits)) + bits = min( + [i for i in (8, 16, 32, 64) if i - 1 > np.log2(ncategories + total_dummies)] + ) + replacements = np.arange(ncategories, dtype="int{:d}".format(bits)) cols = replacements[inverse] if i == 0 and not drop_all: retain = np.arange(nobs) - elif drop == 'first': + elif drop == "first": # remove first retain = cols != 0 else: # drop == 'last' # remove last retain = cols != (ncategories - 1) rows = rows[retain] - col_adj = -1 if (drop == 'first' and i > 0) else 0 + col_adj = -1 if (drop == "first" and i > 0) else 0 cols = cols[retain] + total_dummies + col_adj values = np.ones(rows.shape) - data['values'].append(values) - data['rows'].append(rows) - data['cols'].append(cols) + data["values"].append(values) + data["rows"].append(rows) + data["cols"].append(cols) total_dummies += ncategories - (i > 0) - if format in ('csc', 'array'): + if format in ("csc", "array"): fmt = sp.csc_matrix - elif format == 'csr': + elif format == "csr": fmt = sp.csr_matrix - elif format == 'coo': + elif format == "coo": fmt = sp.coo_matrix else: - raise ValueError('Unknown format: {0}'.format(format)) - out = fmt((np.concatenate(data['values']), - (np.concatenate(data['rows']), np.concatenate(data['cols'])))) - if format == 'array': + raise ValueError("Unknown format: {0}".format(format)) + out = fmt( + ( + np.concatenate(data["values"]), + (np.concatenate(data["rows"]), np.concatenate(data["cols"])), + ) + ) + if format == "array": out = out.toarray() if precondition: @@ -310,10 +318,12 @@ def in_2core_graph(cats): def min_dtype(*args): bits = max([np.log2(max(arg.max(), 1)) for arg in args]) - return 'int{0}'.format(min([i for i in (8, 16, 32, 64) if bits < (i - 1)])) + return "int{0}".format(min([i for i in (8, 16, 32, 64) if bits < (i - 1)])) dtype = min_dtype(offset, node_id, count, orig_dest) - meta = np.column_stack([node_id.astype(dtype), count.astype(dtype), offset.astype(dtype)]) + meta = np.column_stack( + [node_id.astype(dtype), count.astype(dtype), offset.astype(dtype)] + ) orig_dest = orig_dest.astype(dtype) singletons = np.any(meta[:, 1] == 1) @@ -388,8 +398,8 @@ def check_absorbed(x: np.ndarray, variables: List[str]): rows = [] for i in range(nabsorbed): vars_idx = np.where(np.abs(absorbed_vecs[:, i]) > tol)[0] - rows.append(' ' * 10 + ', '.join((variables[vi] for vi in vars_idx))) - absorbed_variables = '\n'.join(rows) + rows.append(" " * 10 + ", 
".join((variables[vi] for vi in vars_idx))) + absorbed_variables = "\n".join(rows) msg = absorbing_error_msg.format(absorbed_variables=absorbed_variables) raise AbsorbingEffectError(msg) diff --git a/linearmodels/system/__init__.py b/linearmodels/system/__init__.py index faed894de8..e710ee6df5 100644 --- a/linearmodels/system/__init__.py +++ b/linearmodels/system/__init__.py @@ -1,3 +1,3 @@ from linearmodels.system.model import IV3SLS, SUR, IVSystemGMM -__all__ = ['SUR', 'IV3SLS', 'IVSystemGMM'] +__all__ = ["SUR", "IV3SLS", "IVSystemGMM"] diff --git a/linearmodels/system/_utility.py b/linearmodels/system/_utility.py index 52b6c9fea7..67834f95ca 100644 --- a/linearmodels/system/_utility.py +++ b/linearmodels/system/_utility.py @@ -14,7 +14,7 @@ def blocked_column_product(x, s): Returns ------- - bp : ndarray + ndarray Blocked product. k x nobs rows and the number of columns is the same the number of columns as any member of x. """ @@ -39,7 +39,7 @@ def blocked_diag_product(x, s): Returns ------- - bp : ndarray + ndarray Blocked product. k x nobs rows and the number of columns is the same as the total number of columns in x. """ @@ -67,7 +67,7 @@ def blocked_inner_prod(x, s): Returns ------- - ip : ndarray + ndarray Weighted inner product constructed from x and s Notes @@ -135,7 +135,7 @@ def blocked_cross_prod(x, z, s): Returns ------- - xp : ndarray + ndarray Weighted cross product constructed from x and s Notes @@ -182,8 +182,8 @@ def blocked_full_inner_product(x, s): for i in range(k): v = s[i, 0] * x[0:t] for j in range(1, k): - v += s[i, j] * x[j * t:(j + 1) * t] - sx[i * t:(i + 1) * t] = v + v += s[i, j] * x[j * t : (j + 1) * t] + sx[i * t : (i + 1) * t] = v return x.T @ sx @@ -220,20 +220,20 @@ class LinearConstraint(object): def __init__(self, r, q=None, num_params=None, require_pandas=True): if not isinstance(r, (pd.DataFrame, np.ndarray)): - raise TypeError('r must be an array or DataFrame') + raise TypeError("r must be an array or DataFrame") elif require_pandas and not isinstance(r, pd.DataFrame): - raise TypeError('r must be a DataFrame') + raise TypeError("r must be a DataFrame") if r.ndim != 2: - raise ValueError('r must be 2-dimensional') + raise ValueError("r must be 2-dimensional") r_pd = pd.DataFrame(r) ra = np.asarray(r, dtype=np.float64) self._r_pd = r_pd self._ra = ra if q is not None: if require_pandas and not isinstance(q, pd.Series): - raise TypeError('q must be a Series') + raise TypeError("q must be a Series") elif not isinstance(q, (pd.Series, np.ndarray)): - raise TypeError('q must be a Series') + raise TypeError("q must be a Series") q_pd = pd.Series(q, index=r_pd.index) else: q_pd = pd.Series(np.zeros(r_pd.shape[0]), index=r_pd.index) @@ -244,26 +244,27 @@ def __init__(self, r, q=None, num_params=None, require_pandas=True): self._verify_constraints() def __repr__(self): - return self.__str__() + '\nid: ' + str(hex(id(self))) + return self.__str__() + "\nid: " + str(hex(id(self))) def __str__(self): - return 'Linear Constraint with {0} constraints'.format(self._ra.shape[0]) + return "Linear Constraint with {0} constraints".format(self._ra.shape[0]) def _verify_constraints(self): r = self._ra q = self._qa if r.shape[0] != q.shape[0]: - raise ValueError('Constraint inputs are not shape compatible') + raise ValueError("Constraint inputs are not shape compatible") if self._num_params is not None: if r.shape[1] != self._num_params: - raise ValueError('r is incompatible with the number of model ' - 'parameters') + raise ValueError( + "r is incompatible with the 
number of model " "parameters" + ) rq = np.c_[r, q[:, None]] if not np.all(np.isfinite(rq)) or matrix_rank(rq) < rq.shape[0]: - raise ValueError('Constraints must be non-redundant') + raise ValueError("Constraints must be non-redundant") qr = np.linalg.qr(rq) if matrix_rank(qr[1][:, :-1]) != matrix_rank(qr[1]): - raise ValueError('One or more constraints are infeasible') + raise ValueError("One or more constraints are infeasible") def _compute_transform(self): r = self._ra @@ -274,7 +275,7 @@ def _compute_transform(self): vecs = np.real(vecs) idx = np.argsort(vals)[::-1] vecs = vecs[:, idx] - t, left = vecs[:, :k - c], vecs[:, k - c:] + t, left = vecs[:, : k - c], vecs[:, k - c :] q = self._qa[:, None] a = q.T @ inv(left.T @ r.T) @ left.T self._t, self._l, self._a = t, left, a @@ -291,7 +292,7 @@ def t(self): Returns ------- - t : ndarray + ndarray Constraint transformation matrix Notes @@ -309,7 +310,7 @@ def a(self): Returns ------- - a : ndarray + ndarray Transformed target Notes diff --git a/linearmodels/system/covariance.py b/linearmodels/system/covariance.py index 3722ac751f..d5ed4a11d3 100644 --- a/linearmodels/system/covariance.py +++ b/linearmodels/system/covariance.py @@ -44,7 +44,9 @@ class HomoskedasticCovariance(object): (X'X)^{-1}(X'\Omega X)(X'X)^{-1} """ - def __init__(self, x, eps, sigma, full_sigma, *, gls=False, debiased=False, constraints=None): + def __init__( + self, x, eps, sigma, full_sigma, *, gls=False, debiased=False, constraints=None + ): self._eps = eps self._x = x self._nobs = eps.shape[0] @@ -54,7 +56,7 @@ def __init__(self, x, eps, sigma, full_sigma, *, gls=False, debiased=False, cons self._gls = gls self._debiased = debiased self._constraints = constraints - self._name = 'Homoskedastic (Unadjusted) Covariance' + self._name = "Homoskedastic (Unadjusted) Covariance" self._str_extra = AttrDict(Debiased=self._debiased, GLS=self._gls) self._cov_config = AttrDict(debiased=self._debiased) @@ -62,14 +64,14 @@ def __str__(self): out = self._name extra = [] for key in self._str_extra: - extra.append(': '.join([key, str(self._str_extra[key])])) + extra.append(": ".join([key, str(self._str_extra[key])])) if extra: - out += ' (' + ', '.join(extra) + ')' + out += " (" + ", ".join(extra) + ")" return out def __repr__(self): out = self.__str__() - return out + ', id: {0}'.format(hex(id(self))) + return out + ", id: {0}".format(hex(id(self))) @property def sigma(self): @@ -174,12 +176,19 @@ class HeteroskedasticCovariance(HomoskedasticCovariance): where :math:`\hat{S}` is a estimator of the covariance of the model scores. 
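The constraint checks above (shape compatibility, non-redundancy, feasibility) can be exercised directly; a minimal sketch, noting that user code normally reaches this class through a model's `add_constraints` rather than by constructing it by hand:

```python
# Sketch: build a LinearConstraint directly to see the validation above run.
import pandas as pd
from linearmodels.system._utility import LinearConstraint

# One restriction on three parameters: b1 - b2 = 0
r = pd.DataFrame([[1.0, -1.0, 0.0]], columns=["b1", "b2", "b3"])
con = LinearConstraint(r, q=pd.Series([0.0]), num_params=3)
print(con)  # Linear Constraint with 1 constraints
t = con.t   # transformation matrix used to impose the restriction
```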
""" - def __init__(self, x, eps, sigma, full_sigma, gls=False, debiased=False, constraints=None): - super(HeteroskedasticCovariance, self).__init__(x, eps, sigma, full_sigma, - gls=gls, - debiased=debiased, - constraints=constraints) - self._name = 'Heteroskedastic (Robust) Covariance' + def __init__( + self, x, eps, sigma, full_sigma, gls=False, debiased=False, constraints=None + ): + super(HeteroskedasticCovariance, self).__init__( + x, + eps, + sigma, + full_sigma, + gls=gls, + debiased=debiased, + constraints=constraints, + ) + self._name = "Heteroskedastic (Robust) Covariance" k = len(x) nobs = eps.shape[0] @@ -200,7 +209,7 @@ def __init__(self, x, eps, sigma, full_sigma, gls=False, debiased=False, constra loc = 0 for i in range(k): offset = x[i].shape[1] - xe[:, loc:loc+offset] = x[i] * eps[:, i:i+1] + xe[:, loc : loc + offset] = x[i] * eps[:, i : i + 1] loc += offset self._moments = xe @@ -305,17 +314,34 @@ class KernelCovariance(HeteroskedasticCovariance, _HACMixin): linearmodels.iv.covariance.kernel_weight_quadratic_spectral """ - def __init__(self, x, eps, sigma, full_sigma, *, gls=False, debiased=False, constraints=None, - kernel='bartlett', bandwidth=None): - super(KernelCovariance, self).__init__(x, eps, sigma, full_sigma, gls=gls, - debiased=debiased, - constraints=constraints) + def __init__( + self, + x, + eps, + sigma, + full_sigma, + *, + gls=False, + debiased=False, + constraints=None, + kernel="bartlett", + bandwidth=None + ): + super(KernelCovariance, self).__init__( + x, + eps, + sigma, + full_sigma, + gls=gls, + debiased=debiased, + constraints=constraints, + ) self._check_kernel(kernel) self._check_bandwidth(bandwidth) - self._name = 'Kernel (HAC) Covariance' - self._str_extra['Kernel'] = kernel - self._cov_config['kernel'] = kernel + self._name = "Kernel (HAC) Covariance" + self._str_extra["Kernel"] = kernel + self._cov_config["kernel"] = kernel def _xeex(self): return self._kernel_cov(self._moments) @@ -324,7 +350,7 @@ def _xeex(self): def cov_config(self): """Optional configuration information used in covariance""" out = AttrDict([(k, v) for k, v in self._cov_config.items()]) - out['bandwidth'] = self.bandwidth + out["bandwidth"] = self.bandwidth return out @@ -368,7 +394,7 @@ def __init__(self, x, z, eps, w, *, sigma=None, debiased=False, constraints=None self._w = w self._debiased = debiased self._constraints = constraints - self._name = 'GMM Homoskedastic (Unadjusted) Covariance' + self._name = "GMM Homoskedastic (Unadjusted) Covariance" self._cov_config = AttrDict(debiased=self._debiased) def __str__(self): @@ -377,7 +403,7 @@ def __str__(self): def __repr__(self): out = self.__str__() - return out + ', id: {0}'.format(hex(id(self))) + return out + ", id: {0}".format(hex(id(self))) @property def cov(self): @@ -401,7 +427,14 @@ def cov(self): xpz_wi_zpx = cons.t.T @ xpz_wi_zpx @ cons.t xpz_wi_zpxi = inv(xpz_wi_zpx) xpz_wi_omega_wi_zpx = cons.t.T @ xpz_wi_omega_wi_zpx @ cons.t - cov = cons.t @ xpz_wi_zpxi @ xpz_wi_omega_wi_zpx @ xpz_wi_zpxi @ cons.t.T / nobs + cov = ( + cons.t + @ xpz_wi_zpxi + @ xpz_wi_omega_wi_zpx + @ xpz_wi_zpxi + @ cons.t.T + / nobs + ) cov = (cov + cov.T) / 2 return adj * cov @@ -465,8 +498,10 @@ class GMMHeteroskedasticCovariance(GMMHomoskedasticCovariance): """ def __init__(self, x, z, eps, w, *, sigma=None, debiased=False, constraints=None): - super().__init__(x, z, eps, w, sigma=sigma, debiased=debiased, constraints=constraints) - self._name = 'GMM Heteroskedastic (Robust) Covariance' + super().__init__( + x, z, eps, w, sigma=sigma, 
debiased=debiased, constraints=constraints + ) + self._name = "GMM Heteroskedastic (Robust) Covariance" k = len(z) k_total = sum(map(lambda a: a.shape[1], z)) @@ -475,7 +510,7 @@ def __init__(self, x, z, eps, w, *, sigma=None, debiased=False, constraints=None ze = empty((nobs, k_total)) for i in range(k): kz = z[i].shape[1] - ze[:, loc:loc + kz] = z[i] * eps[:, [i]] + ze[:, loc : loc + kz] = z[i] * eps[:, [i]] loc += kz self._moments = ze @@ -528,13 +563,26 @@ class GMMKernelCovariance(GMMHeteroskedasticCovariance, _HACMixin): where :math:`\Omega` is the covariance of the moment conditions. """ - def __init__(self, x, z, eps, w, *, sigma=None, debiased=False, constraints=None, - kernel='bartlett', bandwidth=None): - super().__init__(x, z, eps, w, sigma=sigma, debiased=debiased, constraints=constraints) - self._name = 'GMM Kernel (HAC) Covariance' + def __init__( + self, + x, + z, + eps, + w, + *, + sigma=None, + debiased=False, + constraints=None, + kernel="bartlett", + bandwidth=None + ): + super().__init__( + x, z, eps, w, sigma=sigma, debiased=debiased, constraints=constraints + ) + self._name = "GMM Kernel (HAC) Covariance" self._check_bandwidth(bandwidth) self._check_kernel(kernel) - self._cov_config['kernel'] = kernel + self._cov_config["kernel"] = kernel def _omega(self): return self._kernel_cov(self._moments) @@ -543,5 +591,5 @@ def _omega(self): def cov_config(self): """Optional configuration information used in covariance""" out = AttrDict([(k, v) for k, v in self._cov_config.items()]) - out['bandwidth'] = self.bandwidth + out["bandwidth"] = self.bandwidth return out diff --git a/linearmodels/system/gmm.py b/linearmodels/system/gmm.py index 13e54ed768..4449a3beae 100644 --- a/linearmodels/system/gmm.py +++ b/linearmodels/system/gmm.py @@ -40,20 +40,20 @@ def __init__(self, center=False, debiased=False): self._center = center self._debiased = debiased self._bandwidth = 0 - self._name = 'Homoskedastic (Unadjusted) Weighting' + self._name = "Homoskedastic (Unadjusted) Weighting" self._config = AttrDict(center=center, debiased=debiased) def __str__(self): out = self._name extra = [] for key in self._str_extra: - extra.append(': '.join([key, str(self._str_extra[key])])) + extra.append(": ".join([key, str(self._str_extra[key])])) if extra: - out += ' (' + ', '.join(extra) + ')' + out += " (" + ", ".join(extra) + ")" return out def __repr__(self): - return self.__str__() + ', id: {0}'.format(hex(id(self))) + return self.__str__() + ", id: {0}".format(hex(id(self))) @property def _str_extra(self): @@ -104,7 +104,7 @@ def weight_matrix(self, x, z, eps, *, sigma=None): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. """ nobs = z[0].shape[0] @@ -118,7 +118,7 @@ def config(self): Returns ------- - config : AttrDict + AttrDict Dictionary containing weight estimator configuration information """ return self._config @@ -154,7 +154,7 @@ class HeteroskedasticWeightMatrix(HomoskedasticWeightMatrix): def __init__(self, center=False, debiased=False): super(HeteroskedasticWeightMatrix, self).__init__(center, debiased) - self._name = 'Heteroskedastic (Robust) Weighting' + self._name = "Heteroskedastic (Robust) Weighting" def weight_matrix(self, x, z, eps, *, sigma=None): """ @@ -171,7 +171,7 @@ def weight_matrix(self, x, z, eps, *, sigma=None): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. 
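The `kernel` entry stored in `_cov_config` above, together with `bandwidth`, surfaces through a model's `fit` via `**cov_config`; a sketch with synthetic data (the shapes and the bandwidth value are assumptions):

```python
# Sketch: cov_type="kernel" routes kernel/bandwidth into KernelCovariance.
import numpy as np
from linearmodels.system import SUR

rs = np.random.RandomState(0)
x = np.column_stack([np.ones(500), rs.standard_normal(500)])
eqns = {"eq1": (rs.standard_normal((500, 1)), x),
        "eq2": (rs.standard_normal((500, 1)), x)}
res = SUR(eqns).fit(cov_type="kernel", kernel="bartlett", bandwidth=12)
print(res.cov_estimator)  # Kernel (HAC) Covariance (..., Kernel: bartlett)
```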
""" nobs = x[0].shape[0] @@ -182,7 +182,7 @@ def weight_matrix(self, x, z, eps, *, sigma=None): for i in range(k): e = eps[:, [i]] zk = z[i].shape[1] - ze[:, loc:loc + zk] = z[i] * e + ze[:, loc : loc + zk] = z[i] * e loc += zk mu = ze.mean(axis=0) if self._center else 0 ze -= mu @@ -247,10 +247,16 @@ class KernelWeightMatrix(HeteroskedasticWeightMatrix, _HACMixin): between the moment conditions. """ - def __init__(self, center=False, debiased=False, kernel='bartlett', bandwidth=None, - optimal_bw=False): + def __init__( + self, + center=False, + debiased=False, + kernel="bartlett", + bandwidth=None, + optimal_bw=False, + ): super(KernelWeightMatrix, self).__init__(center, debiased) - self._name = 'Kernel (HAC) Weighting' + self._name = "Kernel (HAC) Weighting" self._check_kernel(kernel) self._check_bandwidth(bandwidth) self._predefined_bw = self._bandwidth @@ -271,7 +277,7 @@ def weight_matrix(self, x, z, eps, *, sigma=None): Returns ------- - weight : ndarray + ndarray Covariance of GMM moment conditions. """ nobs = x[0].shape[0] @@ -282,7 +288,7 @@ def weight_matrix(self, x, z, eps, *, sigma=None): for i in range(k): e = eps[:, [i]] zk = z[i].shape[1] - ze[:, loc:loc + zk] = z[i] * e + ze[:, loc : loc + zk] = z[i] * e loc += zk mu = ze.mean(axis=0) if self._center else 0 ze -= mu @@ -317,9 +323,9 @@ def config(self): Returns ------- - config : AttrDict + AttrDict Dictionary containing weight estimator configuration information """ out = AttrDict([(k, v) for k, v in self._config.items()]) - out['bandwidth'] = self.bandwidth + out["bandwidth"] = self.bandwidth return out diff --git a/linearmodels/system/model.py b/linearmodels/system/model.py index 660d979950..1c6aa9e0a0 100644 --- a/linearmodels/system/model.py +++ b/linearmodels/system/model.py @@ -42,7 +42,7 @@ WaldTestStatistic, has_constant, missing_warning) -__all__ = ['SUR', 'IV3SLS', 'IVSystemGMM'] +__all__ = ["SUR", "IV3SLS", "IVSystemGMM"] UNKNOWN_EQ_TYPE = """ Contents of each equation must be either a dictionary with keys 'dependent' @@ -50,24 +50,32 @@ equations[{key}] was {type} """ -COV_TYPES = {'unadjusted': 'unadjusted', - 'homoskedastic': 'unadjusted', - 'robust': 'robust', - 'heteroskedastic': 'robust', - 'kernel': 'kernel', - 'hac': 'kernel'} - -COV_EST = {'unadjusted': HomoskedasticCovariance, - 'robust': HeteroskedasticCovariance, - 'kernel': KernelCovariance} - -GMM_W_EST = {'unadjusted': HomoskedasticWeightMatrix, - 'robust': HeteroskedasticWeightMatrix, - 'kernel': KernelWeightMatrix} - -GMM_COV_EST = {'unadjusted': GMMHomoskedasticCovariance, - 'robust': GMMHeteroskedasticCovariance, - 'kernel': GMMKernelCovariance} +COV_TYPES = { + "unadjusted": "unadjusted", + "homoskedastic": "unadjusted", + "robust": "robust", + "heteroskedastic": "robust", + "kernel": "kernel", + "hac": "kernel", +} + +COV_EST = { + "unadjusted": HomoskedasticCovariance, + "robust": HeteroskedasticCovariance, + "kernel": KernelCovariance, +} + +GMM_W_EST = { + "unadjusted": HomoskedasticWeightMatrix, + "robust": HeteroskedasticWeightMatrix, + "kernel": KernelWeightMatrix, +} + +GMM_COV_EST = { + "unadjusted": GMMHomoskedasticCovariance, + "robust": GMMHeteroskedasticCovariance, + "kernel": GMMKernelCovariance, +} def _to_ordered_dict(equations): @@ -86,7 +94,8 @@ def _missing_weights(weights): missing = [key for key in weights if weights[key] is None] if missing: import warnings - msg = 'Weights not found for equation labels:\n{0}'.format(', '.join(missing)) + + msg = "Weights not found for equation labels:\n{0}".format(", ".join(missing)) 
warnings.warn(msg, UserWarning) return None @@ -135,7 +144,7 @@ def _parameters_from_xprod(xpx, xpy, constraints=None): class SystemFormulaParser(object): def __init__(self, formula, data, weights=None, eval_env=6): if not isinstance(formula, (Mapping, str)): - raise TypeError('formula must be a string or dictionary-like') + raise TypeError("formula must be a string or dictionary-like") self._formula = formula self._data = data self._weights = weights @@ -147,8 +156,8 @@ def __init__(self, formula, data, weights=None, eval_env=6): @staticmethod def _prevent_autoconst(formula): - if not (' 0+' in formula or ' 0 +' in formula): - formula = '~ 0 +'.join(formula.split('~')) + if not (" 0+" in formula or " 0 +" in formula): + formula = "~ 0 +".join(formula.split("~")) return formula def _parse(self): @@ -172,24 +181,24 @@ def _parse(self): weight_dict[key] = None cln_formula[key] = f else: - formula = formula.replace('\n', ' ').strip() - parts = formula.split('}') + formula = formula.replace("\n", " ").strip() + parts = formula.split("}") for part in parts: key = base_key = None part = part.strip() - if part == '': + if part == "": continue - part = part.replace('{', '') - if ':' in part.split('~')[0]: - base_key, part = part.split(':') + part = part.replace("{", "") + if ":" in part.split("~")[0]: + base_key, part = part.split(":") key = base_key = base_key.strip() part = part.strip() f = self._prevent_autoconst(part) if base_key is None: - base_key = key = f.split('~')[0].strip() + base_key = key = f.split("~")[0].strip() count = 0 while key in parsers: - key = base_key + '.{0}'.format(count) + key = base_key + ".{0}".format(count) count += 1 parsers[key] = IVFormulaParser(f, data, eval_env=self._eval_env) cln_formula[key] = f @@ -202,7 +211,9 @@ def _parse(self): self._weight_dict = weight_dict def _get_variable(self, variable): - return OrderedDict([(key, getattr(self._parsers[key], variable)) for key in self._parsers]) + return OrderedDict( + [(key, getattr(self._parsers[key], variable)) for key in self._parsers] + ) @property def formula(self): @@ -222,7 +233,9 @@ def eval_env(self, value): new_parsers = OrderedDict() for key in parsers: parser = parsers[key] - new_parsers[key] = IVFormulaParser(parser._formula, parser._data, self._eval_env) + new_parsers[key] = IVFormulaParser( + parser._formula, parser._data, self._eval_env + ) self._parsers = new_parsers @property @@ -234,36 +247,36 @@ def data(self): out = OrderedDict() dep = self.dependent for key in dep: - out[key] = {'dependent': dep[key]} + out[key] = {"dependent": dep[key]} exog = self.exog for key in exog: - out[key]['exog'] = exog[key] + out[key]["exog"] = exog[key] endog = self.endog for key in endog: - out[key]['endog'] = endog[key] + out[key]["endog"] = endog[key] instr = self.instruments for key in instr: - out[key]['instruments'] = instr[key] + out[key]["instruments"] = instr[key] for key in self._weight_dict: if self._weight_dict[key] is not None: - out[key]['weights'] = self._weight_dict[key] + out[key]["weights"] = self._weight_dict[key] return out @property def dependent(self): - return self._get_variable('dependent') + return self._get_variable("dependent") @property def exog(self): - return self._get_variable('exog') + return self._get_variable("exog") @property def endog(self): - return self._get_variable('endog') + return self._get_variable("endog") @property def instruments(self): - return self._get_variable('instruments') + return self._get_variable("instruments") class IV3SLS(object): @@ -287,7 +300,7 @@ class 
IV3SLS(object): equation contains no exogenous regressors. Similarly 'endog' and 'instruments' can either be omitted or may contain an empty array (or `None`) if all variables in an equation are exogenous. - sigma : array-like + sigma : array_like Prespecified residual covariance to use in GLS estimation. If not provided, FGLS is implemented based on an estimate of sigma. @@ -352,10 +365,10 @@ class IV3SLS(object): def __init__(self, equations, *, sigma=None): if not isinstance(equations, Mapping): - raise TypeError('equations must be a dictionary-like') + raise TypeError("equations must be a dictionary-like") for key in equations: if not isinstance(key, str): - raise ValueError('Equation labels (keys) must be strings') + raise ValueError("Equation labels (keys) must be strings") # Ensure nearly deterministic equation ordering equations = _to_ordered_dict(equations) @@ -366,8 +379,10 @@ def __init__(self, equations, *, sigma=None): self._sigma = np.asarray(sigma) k = len(self._equations) if self._sigma.shape != (k, k): - raise ValueError('sigma must be a square matrix with dimensions ' - 'equal to the number of equations') + raise ValueError( + "sigma must be a square matrix with dimensions " + "equal to the number of equations" + ) self._param_names = [] self._eq_labels = [] self._dependent = [] @@ -390,7 +405,7 @@ def __init__(self, equations, *, sigma=None): self._has_constant = None self._common_exog = False self._original_index = None - self._model_name = 'Three Stage Least Squares (3SLS)' + self._model_name = "Three Stage Least Squares (3SLS)" self._validate_data() @@ -408,22 +423,26 @@ def _validate_data(self): for i, key in enumerate(self._equations): self._eq_labels.append(key) eq_data = self._equations[key] - dep_name = 'dependent_' + str(i) - exog_name = 'exog_' + str(i) - endog_name = 'endog_' + str(i) - instr_name = 'instr_' + str(i) + dep_name = "dependent_" + str(i) + exog_name = "exog_" + str(i) + endog_name = "endog_" + str(i) + instr_name = "instr_" + str(i) if isinstance(eq_data, (tuple, list)): dep = IVData(eq_data[0], var_name=dep_name) self._dependent.append(dep) current_id = id(eq_data[1]) - self._exog.append(IVData(eq_data[1], var_name=exog_name, nobs=dep.shape[0])) + self._exog.append( + IVData(eq_data[1], var_name=exog_name, nobs=dep.shape[0]) + ) endog = IVData(eq_data[2], var_name=endog_name, nobs=dep.shape[0]) if endog.shape[1] > 0: current_id = (current_id, id(eq_data[2])) ids.append(current_id) self._endog.append(endog) - self._instr.append(IVData(eq_data[3], var_name=instr_name, nobs=dep.shape[0])) + self._instr.append( + IVData(eq_data[3], var_name=instr_name, nobs=dep.shape[0]) + ) if len(eq_data) == 5: self._weights.append(IVData(eq_data[4])) else: @@ -431,26 +450,26 @@ def _validate_data(self): self._weights.append(IVData(np.ones_like(dep))) elif isinstance(eq_data, (dict, Mapping)): - dep = IVData(eq_data['dependent'], var_name=dep_name) + dep = IVData(eq_data["dependent"], var_name=dep_name) self._dependent.append(dep) - exog = eq_data.get('exog', None) + exog = eq_data.get("exog", None) self._exog.append(IVData(exog, var_name=exog_name, nobs=dep.shape[0])) current_id = id(exog) - endog = eq_data.get('endog', None) + endog = eq_data.get("endog", None) endog = IVData(endog, var_name=endog_name, nobs=dep.shape[0]) self._endog.append(endog) - if 'endog' in eq_data: - current_id = (current_id, id(eq_data['endog'])) + if "endog" in eq_data: + current_id = (current_id, id(eq_data["endog"])) ids.append(current_id) - instr = eq_data.get('instruments', None) + 
instr = eq_data.get("instruments", None) instr = IVData(instr, var_name=instr_name, nobs=dep.shape[0]) self._instr.append(instr) - if 'weights' in eq_data: - self._weights.append(IVData(eq_data['weights'])) + if "weights" in eq_data: + self._weights.append(IVData(eq_data["weights"])) else: self._weights.append(IVData(np.ones(dep.shape))) else: @@ -460,14 +479,17 @@ def _validate_data(self): for instr in self._instr: self._has_instruments = self._has_instruments or (instr.shape[1] > 1) - for i, comps in enumerate(zip(self._dependent, self._exog, self._endog, self._instr, - self._weights)): + for i, comps in enumerate( + zip(self._dependent, self._exog, self._endog, self._instr, self._weights) + ): shapes = list(map(lambda a: a.shape[0], comps)) if min(shapes) != max(shapes): - raise ValueError('Dependent, exogenous, endogenous and ' - 'instruments, and weights, if provided, do ' - 'not have the same number of observations in ' - '{eq}'.format(eq=self._eq_labels[i])) + raise ValueError( + "Dependent, exogenous, endogenous and " + "instruments, and weights, if provided, do " + "not have the same number of observations in " + "{eq}".format(eq=self._eq_labels[i]) + ) self._drop_missing() self._common_exog = len(set(ids)) == 1 @@ -479,9 +501,14 @@ def _validate_data(self): constant = [] constant_loc = [] - for dep, exog, endog, instr, w, label in zip(self._dependent, self._exog, self._endog, - self._instr, self._weights, - self._eq_labels): + for dep, exog, endog, instr, w, label in zip( + self._dependent, + self._exog, + self._endog, + self._instr, + self._weights, + self._eq_labels, + ): y = dep.ndarray x = np.concatenate([exog.ndarray, endog.ndarray], 1) z = np.concatenate([exog.ndarray, instr.ndarray], 1) @@ -496,29 +523,38 @@ def _validate_data(self): self._wx.append(x * w_sqrt) self._wz.append(z * w_sqrt) cols = list(exog.cols) + list(endog.cols) - self._param_names.extend([label + '_' + col for col in cols]) + self._param_names.extend([label + "_" + col for col in cols]) if y.shape[0] <= x.shape[1]: - raise ValueError('Fewer observations than variables in ' - 'equation {eq}'.format(eq=label)) + raise ValueError( + "Fewer observations than variables in " + "equation {eq}".format(eq=label) + ) if matrix_rank(x) < x.shape[1]: - raise ValueError('Equation {eq} regressor array is not full ' - 'rank'.format(eq=label)) + raise ValueError( + "Equation {eq} regressor array is not full " "rank".format(eq=label) + ) if x.shape[1] > z.shape[1]: - raise ValueError('Equation {eq} has fewer instruments than ' - 'endogenous variables.'.format(eq=label)) + raise ValueError( + "Equation {eq} has fewer instruments than " + "endogenous variables.".format(eq=label) + ) if z.shape[1] > z.shape[0]: - raise ValueError('Fewer observations than instruments in ' - 'equation {eq}'.format(eq=label)) + raise ValueError( + "Fewer observations than instruments in " + "equation {eq}".format(eq=label) + ) if matrix_rank(z) < z.shape[1]: - raise ValueError('Equation {eq} instrument array is full ' - 'rank'.format(eq=label)) + raise ValueError( + "Equation {eq} instrument array is not full " "rank".format(eq=label) + ) for rhs in self._x: const, const_loc = has_constant(rhs) constant.append(const) constant_loc.append(const_loc) - self._has_constant = Series(constant, - index=[d.cols[0] for d in self._dependent]) + self._has_constant = Series( + constant, index=[d.cols[0] for d in self._dependent] + ) self._constant_loc = constant_loc def _drop_missing(self): @@ -544,15 +580,15 @@ def _drop_missing(self):
self._weights[i].drop(missing) def __repr__(self): - return self.__str__() + '\nid: {0}'.format(hex(id(self))) + return self.__str__() + "\nid: {0}".format(hex(id(self))) def __str__(self): - out = self._model_name + ', ' - out += '{0} Equations:\n'.format(len(self._y)) - eqns = ', '.join(self._equations.keys()) - out += '\n'.join(textwrap.wrap(eqns, 70)) + out = self._model_name + ", " + out += "{0} Equations:\n".format(len(self._y)) + eqns = ", ".join(self._equations.keys()) + out += "\n".join(textwrap.wrap(eqns, 70)) if self._common_exog: - out += '\nCommon Exogenous Variables' + out += "\nCommon Exogenous Variables" return out def predict(self, params, *, equations=None, data=None, eval_env=8): @@ -561,7 +597,7 @@ def predict(self, params, *, equations=None, data=None, eval_env=8): Parameters ---------- - params : array-like + params : array_like Model parameters (nvar by 1) equations : dict Dictionary-like structure containing exogenous and endogenous @@ -608,10 +644,10 @@ def predict(self, params, *, equations=None, data=None, eval_env=8): for i, label in enumerate(self._eq_labels): kx = self._x[i].shape[1] if label in equations: - b = params[loc:loc + kx] + b = params[loc : loc + kx] eqn = equations[label] # type: dict - exog = eqn.get('exog', None) - endog = eqn.get('endog', None) + exog = eqn.get("exog", None) + endog = eqn.get("endog", None) if exog is None and endog is None: loc += kx continue @@ -628,13 +664,25 @@ def predict(self, params, *, equations=None, data=None, eval_env=8): fitted = DataFrame(fitted, index=exog_endog.index, columns=[label]) out[label] = fitted loc += kx - out = reduce(lambda left, right: left.merge(right, how='outer', - left_index=True, right_index=True), - [out[key] for key in out]) + out = reduce( + lambda left, right: left.merge( + right, how="outer", left_index=True, right_index=True + ), + [out[key] for key in out], + ) return out - def fit(self, *, method=None, full_cov=True, iterate=False, iter_limit=100, tol=1e-6, - cov_type='robust', **cov_config): + def fit( + self, + *, + method=None, + full_cov=True, + iterate=False, + iter_limit=100, + tol=1e-6, + cov_type="robust", + **cov_config + ): """ Estimate model parameters @@ -677,9 +725,14 @@ def fit(self, *, method=None, full_cov=True, iterate=False, iter_limit=100, tol= linearmodels.system.covariance.HeteroskedasticCovariance linearmodels.system.covariance.KernelCovariance """ + if method is None: + method = ( + "ols" if (self._common_exog and self._constraints is None) else "gls" + ) + cov_type = cov_type.lower() if cov_type not in COV_TYPES: - raise ValueError('Unknown cov_type: {0}'.format(cov_type)) + raise ValueError("Unknown cov_type: {0}".format(cov_type)) cov_type = COV_TYPES[cov_type] k = len(self._dependent) col_sizes = [0] + list(map(lambda v: v.shape[1], self._x)) @@ -688,18 +741,24 @@ def fit(self, *, method=None, full_cov=True, iterate=False, iter_limit=100, tol= self._construct_xhat() beta, eps = self._multivariate_ls_fit() nobs = eps.shape[0] - debiased = cov_config.get('debiased', False) + debiased = cov_config.get("debiased", False) full_sigma = sigma = (eps.T @ eps / nobs) * self._sigma_scale(debiased) - if (self._common_exog and method is None and self._constraints is None) or method == 'ols': - return self._multivariate_ls_finalize(beta, eps, sigma, cov_type, **cov_config) + + if method == "ols": + return self._multivariate_ls_finalize( + beta, eps, sigma, col_idx, total_cols, cov_type, **cov_config + ) beta_hist = [beta] nobs = eps.shape[0] iter_count = 0 delta = np.inf - 
while ((iter_count < iter_limit and iterate) or iter_count == 0) and delta >= tol: - beta, eps, sigma = self._gls_estimate(eps, nobs, total_cols, col_idx, - full_cov, debiased) + while ( + (iter_count < iter_limit and iterate) or iter_count == 0 + ) and delta >= tol: + beta, eps, sigma, est_sigma = self._gls_estimate( + eps, nobs, total_cols, col_idx, full_cov, debiased + ) beta_hist.append(beta) delta = beta_hist[-1] - beta_hist[-2] delta = np.sqrt(np.mean(delta ** 2)) @@ -714,8 +773,18 @@ def fit(self, *, method=None, full_cov=True, iterate=False, iter_limit=100, tol= x = blocked_diag_product(self._x, np.eye(k)) eps = y - x @ beta - return self._gls_finalize(beta, sigma, full_sigma, gls_eps, - eps, cov_type, iter_count, **cov_config) + return self._gls_finalize( + beta, + sigma, + full_sigma, + est_sigma, + gls_eps, + eps, + full_cov, + cov_type, + iter_count, + **cov_config + ) def _multivariate_ls_fit(self): wy, wx, wxhat = self._wy, self._wx, self._wxhat @@ -732,7 +801,7 @@ def _multivariate_ls_fit(self): eps = [] for i in range(k): nb = wx[i].shape[1] - b = beta[loc:loc + nb] + b = beta[loc : loc + nb] eps.append(wy[i] - wx[i] @ b) loc += nb eps = np.hstack(eps) @@ -763,6 +832,7 @@ def _gls_estimate(self, eps, nobs, total_cols, ci, full_cov, debiased): if sigma is None: sigma = eps.T @ eps / nobs sigma *= self._sigma_scale(debiased) + est_sigma = sigma if not full_cov: sigma = np.diag(np.diag(sigma)) @@ -776,7 +846,7 @@ def _gls_estimate(self, eps, nobs, total_cols, ci, full_cov, debiased): sy = np.zeros((nobs, 1)) for j in range(k): sy += sigma_inv[i, j] * wy[j] - xpy[ci[i]:ci[i + 1]] = wxhat[i].T @ sy + xpy[ci[i] : ci[i + 1]] = wxhat[i].T @ sy beta = _parameters_from_xprod(xpx, xpy, constraints=self.constraints) @@ -785,22 +855,31 @@ def _gls_estimate(self, eps, nobs, total_cols, ci, full_cov, debiased): _wx = wx[j] _wy = wy[j] kx = _wx.shape[1] - eps[:, [j]] = _wy - _wx @ beta[loc:loc + kx] + eps[:, [j]] = _wy - _wx @ beta[loc : loc + kx] loc += kx - return beta, eps, sigma + return beta, eps, sigma, est_sigma - def _multivariate_ls_finalize(self, beta, eps, sigma, cov_type, **cov_config): + def _multivariate_ls_finalize( + self, beta, eps, sigma, col_idx, total_cols, cov_type, **cov_config + ): k = len(self._wx) # Covariance estimation cov_est = COV_EST[cov_type] - cov_est = cov_est(self._wxhat, eps, sigma, sigma, gls=False, - constraints=self._constraints, **cov_config) + cov_est = cov_est( + self._wxhat, + eps, + sigma, + sigma, + gls=False, + constraints=self._constraints, + **cov_config + ) cov = cov_est.cov individual = AttrDict() - debiased = cov_config.get('debiased', False) + debiased = cov_config.get("debiased", False) for i in range(k): wy = wye = self._wy[i] w = self._w[i] @@ -809,17 +888,33 @@ def _multivariate_ls_finalize(self, beta, eps, sigma, cov_type, **cov_config): wc = np.ones_like(wy) * np.sqrt(w) wye = wy - wc @ lstsq(wc, wy)[0] total_ss = float(wye.T @ wye) - stats = self._common_indiv_results(i, beta, cov, eps, eps, 'OLS', - cov_type, cov_est, 0, debiased, cons, total_ss) + stats = self._common_indiv_results( + i, + beta, + cov, + eps, + eps, + "OLS", + cov_type, + cov_est, + 0, + debiased, + cons, + total_ss, + ) key = self._eq_labels[i] individual[key] = stats nobs = eps.size - results = self._common_results(beta, cov, 'OLS', 0, nobs, cov_type, - sigma, individual, debiased) - results['wresid'] = results.resid - results['cov_estimator'] = cov_est - results['cov_config'] = cov_est.cov_config + results = self._common_results( + beta, cov, "OLS", 0, nobs, 
cov_type, sigma, individual, debiased + ) + results["wresid"] = results.resid + results["cov_estimator"] = cov_est + results["cov_config"] = cov_est.cov_config + individual = results["individual"] + r2s = [individual[eq].r2 for eq in individual] + results["system_r2"] = self._system_r2(eps, sigma, "ols", False, debiased, r2s) return SystemResults(results) @@ -835,13 +930,13 @@ def multivariate_ls(cls, dependent, exog=None, endog=None, instruments=None): Parameters ---------- - dependent : array-like + dependent : array_like nobs by ndep array of dependent variables - exog : array-like, optional + exog : array_like, optional nobs by nexog array of exogenous regressors common to all models - endog : array-like, optional + endog : array_like, optional nobs by nendog array of endogenous regressors common to all models - instruments : array-like, optional + instruments : array_like, optional nobs by ninstr array of instruments to use in all equations Returns @@ -859,14 +954,19 @@ def multivariate_ls(cls, dependent, exog=None, endog=None, instruments=None): exogenous, endogenous and instrumental variables. """ equations = OrderedDict() - dependent = IVData(dependent, var_name='dependent') + dependent = IVData(dependent, var_name="dependent") if exog is None and endog is None: - raise ValueError('At least one of exog or endog must be provided') - exog = IVData(exog, var_name='exog') - endog = IVData(endog, var_name='endog', nobs=dependent.shape[0]) - instr = IVData(instruments, var_name='instruments', nobs=dependent.shape[0]) + raise ValueError("At least one of exog or endog must be provided") + exog = IVData(exog, var_name="exog") + endog = IVData(endog, var_name="endog", nobs=dependent.shape[0]) + instr = IVData(instruments, var_name="instruments", nobs=dependent.shape[0]) for col in dependent.pandas: - equations[col] = (dependent.pandas[[col]], exog.pandas, endog.pandas, instr.pandas) + equations[col] = ( + dependent.pandas[[col]], + exog.pandas, + endog.pandas, + instr.pandas, + ) return cls(equations) @classmethod @@ -882,7 +982,7 @@ def from_formula(cls, formula, data, *, sigma=None, weights=None): description of the accepted syntax data : DataFrame Frame containing named variables - sigma : array-like + sigma : array_like Prespecified residual covariance to use in GLS estimation. If not provided, FGLS is implemented based on an estimate of sigma. weights : dict-like @@ -942,96 +1042,114 @@ def _f_stat(self, stats, debiased): params = stats.params[sel] df = params.shape[0] nobs = stats.nobs - null = 'All parameters ex. constant are zero' - name = 'Equation F-statistic' + null = "All parameters ex. 
constant are zero" + name = "Equation F-statistic" try: stat = float(params.T @ inv(cov) @ params) except np.linalg.LinAlgError: - return InvalidTestStatistic('Covariance is singular, possibly due ' - 'to constraints.', name=name) + return InvalidTestStatistic( + "Covariance is singular, possibly due " "to constraints.", name=name + ) if debiased: total_reg = np.sum(list(map(lambda s: s.shape[1], self._wx))) df_denom = len(self._wx) * nobs - total_reg - wald = WaldTestStatistic(stat / df, null, df, df_denom=df_denom, - name=name) + wald = WaldTestStatistic(stat / df, null, df, df_denom=df_denom, name=name) else: return WaldTestStatistic(stat, null=null, df=df, name=name) return wald - def _common_indiv_results(self, index, beta, cov, wresid, resid, method, - cov_type, cov_est, iter_count, debiased, constant, total_ss, - *, weight_est=None): + def _common_indiv_results( + self, + index, + beta, + cov, + wresid, + resid, + method, + cov_type, + cov_est, + iter_count, + debiased, + constant, + total_ss, + *, + weight_est=None + ): loc = 0 for i in range(index): loc += self._wx[i].shape[1] i = index stats = AttrDict() # Static properties - stats['eq_label'] = self._eq_labels[i] - stats['dependent'] = self._dependent[i].cols[0] - stats['instruments'] = self._instr[i].cols if self._instr[i].shape[1] > 0 else None - stats['endog'] = self._endog[i].cols if self._endog[i].shape[1] > 0 else None - stats['method'] = method - stats['cov_type'] = cov_type - stats['cov_estimator'] = cov_est - stats['cov_config'] = cov_est.cov_config - stats['weight_estimator'] = weight_est - stats['index'] = self._dependent[i].rows - stats['original_index'] = self._original_index - stats['iter'] = iter_count - stats['debiased'] = debiased - stats['has_constant'] = bool(constant) - stats['constant_loc'] = self._constant_loc[i] + stats["eq_label"] = self._eq_labels[i] + stats["dependent"] = self._dependent[i].cols[0] + stats["instruments"] = ( + self._instr[i].cols if self._instr[i].shape[1] > 0 else None + ) + stats["endog"] = self._endog[i].cols if self._endog[i].shape[1] > 0 else None + stats["method"] = method + stats["cov_type"] = cov_type + stats["cov_estimator"] = cov_est + stats["cov_config"] = cov_est.cov_config + stats["weight_estimator"] = weight_est + stats["index"] = self._dependent[i].rows + stats["original_index"] = self._original_index + stats["iter"] = iter_count + stats["debiased"] = debiased + stats["has_constant"] = bool(constant) + stats["constant_loc"] = self._constant_loc[i] # Parameters, errors and measures of fit wxi = self._wx[i] nobs, df = wxi.shape - b = beta[loc:loc + df] + b = beta[loc : loc + df] e = wresid[:, [i]] nobs = e.shape[0] - df_c = (nobs - constant) - df_r = (nobs - df) - - stats['params'] = b - stats['cov'] = cov[loc:loc + df, loc:loc + df] - stats['wresid'] = e - stats['nobs'] = nobs - stats['df_model'] = df - stats['resid'] = resid[:, [i]] - stats['fitted'] = self._x[i] @ b - stats['resid_ss'] = float(resid[:, [i]].T @ resid[:, [i]]) - stats['total_ss'] = total_ss - stats['r2'] = 1.0 - stats.resid_ss / stats.total_ss - stats['r2a'] = 1.0 - (stats.resid_ss / df_r) / (stats.total_ss / df_c) - - names = self._param_names[loc:loc + df] + df_c = nobs - constant + df_r = nobs - df + + stats["params"] = b + stats["cov"] = cov[loc : loc + df, loc : loc + df] + stats["wresid"] = e + stats["nobs"] = nobs + stats["df_model"] = df + stats["resid"] = resid[:, [i]] + stats["fitted"] = self._x[i] @ b + stats["resid_ss"] = float(resid[:, [i]].T @ resid[:, [i]]) + stats["total_ss"] = total_ss + 
stats["r2"] = 1.0 - stats.resid_ss / stats.total_ss + stats["r2a"] = 1.0 - (stats.resid_ss / df_r) / (stats.total_ss / df_c) + + names = self._param_names[loc : loc + df] offset = len(stats.eq_label) + 1 - stats['param_names'] = [n[offset:] for n in names] + stats["param_names"] = [n[offset:] for n in names] # F-statistic - stats['f_stat'] = self._f_stat(stats, debiased) + stats["f_stat"] = self._f_stat(stats, debiased) return stats - def _common_results(self, beta, cov, method, iter_count, nobs, cov_type, - sigma, individual, debiased): + def _common_results( + self, beta, cov, method, iter_count, nobs, cov_type, sigma, individual, debiased + ): results = AttrDict() - results['method'] = method - results['iter'] = iter_count - results['nobs'] = nobs - results['cov_type'] = cov_type - results['index'] = self._dependent[0].rows - results['original_index'] = self._original_index - results['sigma'] = sigma - results['individual'] = individual - results['params'] = beta - results['df_model'] = beta.shape[0] - results['param_names'] = self._param_names - results['cov'] = cov - results['debiased'] = debiased + results["method"] = method + results["iter"] = iter_count + results["nobs"] = nobs + results["cov_type"] = cov_type + results["index"] = self._dependent[0].rows + results["original_index"] = self._original_index + names = list(individual.keys()) + results["sigma"] = DataFrame(sigma, columns=names, index=names) + results["individual"] = individual + results["params"] = beta + results["df_model"] = beta.shape[0] + results["param_names"] = self._param_names + results["cov"] = cov + results["debiased"] = debiased total_ss = resid_ss = 0.0 resid = [] @@ -1041,12 +1159,12 @@ def _common_results(self, beta, cov, method, iter_count, nobs, cov_type, resid.append(individual[key].resid) resid = np.hstack(resid) - results['resid_ss'] = resid_ss - results['total_ss'] = total_ss - results['r2'] = 1.0 - results.resid_ss / results.total_ss - results['resid'] = resid - results['constraints'] = self._constraints - results['model'] = self + results["resid_ss"] = resid_ss + results["total_ss"] = total_ss + results["r2"] = 1.0 - results.resid_ss / results.total_ss + results["resid"] = resid + results["constraints"] = self._constraints + results["model"] = self x = self._x k = len(x) @@ -1054,17 +1172,80 @@ def _common_results(self, beta, cov, method, iter_count, nobs, cov_type, fitted = [] for i in range(k): nb = x[i].shape[1] - b = beta[loc:loc + nb] + b = beta[loc : loc + nb] fitted.append(x[i] @ b) loc += nb fitted = np.hstack(fitted) - results['fitted'] = fitted + results["fitted"] = fitted return results - def _gls_finalize(self, beta, sigma, full_sigma, gls_eps, eps, - cov_type, iter_count, **cov_config): + def _system_r2(self, eps, sigma, method, full_cov, debiased, r2s): + sigma_resid = sigma + + # System regression on a constant using weights if provided + wy, w = self._wy, self._w + wi = [np.sqrt(weights) for weights in w] + if method == "ols": + est_sigma = np.eye(len(wy)) + else: # gls + est_sigma = sigma + if not full_cov: + est_sigma = np.diag(np.diag(est_sigma)) + est_sigma_inv = inv(est_sigma) + nobs = wy[0].shape[0] + k = len(wy) + xpx = blocked_inner_prod(wi, est_sigma_inv) + xpy = np.zeros((k, 1)) + for i in range(k): + sy = np.zeros((nobs, 1)) + for j in range(k): + sy += est_sigma_inv[i, j] * wy[j] + xpy[i : (i + 1)] = wi[i].T @ sy + + mu = _parameters_from_xprod(xpx, xpy) + eps_const = np.hstack([self._y[j] - mu[j] for j in range(k)]) + # Judge + judge = 1 - (eps ** 2).sum() / (eps_const 
** 2).sum() + # Dhrymes + tot_eps_const_sq = (eps_const ** 2).sum(0) + r2s = np.asarray(r2s) + dhrymes = (r2s * tot_eps_const_sq).sum() / tot_eps_const_sq.sum() + + # Berndt + sigma_y = (eps_const.T @ eps_const / nobs) * self._sigma_scale(debiased) + berndt = np.nan + # Avoid division by 0 + if np.linalg.det(sigma_y) > 0: + berndt = 1 - np.linalg.det(sigma_resid) / np.linalg.det(sigma_y) + + mcelroy = np.nan + # Check that the matrix is invertible + if np.linalg.matrix_rank(sigma) == sigma.shape[0]: + # McElroy + sigma_m12 = inv_matrix_sqrt(sigma) + std_eps = eps @ sigma_m12 + numerator = (std_eps ** 2).sum() + std_eps_const = eps_const @ sigma_m12 + denom = (std_eps_const ** 2).sum() + mcelroy = 1.0 - numerator / denom + r2 = dict(mcelroy=mcelroy, berndt=berndt, judge=judge, dhrymes=dhrymes) + return Series(r2) + + def _gls_finalize( + self, + beta, + sigma, + full_sigma, + est_sigma, + gls_eps, + eps, + full_cov, + cov_type, + iter_count, + **cov_config + ): """Collect results to return after GLS estimation""" k = len(self._wy) @@ -1072,14 +1253,21 @@ def _gls_finalize(self, beta, sigma, full_sigma, gls_eps, eps, cov_est = COV_EST[cov_type] gls_eps = np.reshape(gls_eps, (k, gls_eps.shape[0] // k)).T eps = np.reshape(eps, (k, eps.shape[0] // k)).T - cov_est = cov_est(self._wxhat, gls_eps, sigma, full_sigma, gls=True, - constraints=self._constraints, **cov_config) + cov_est = cov_est( + self._wxhat, + gls_eps, + sigma, + full_sigma, + gls=True, + constraints=self._constraints, + **cov_config + ) cov = cov_est.cov # Repackage results for individual equations individual = AttrDict() - debiased = cov_config.get('debiased', False) - method = 'Iterative GLS' if iter_count > 1 else 'GLS' + debiased = cov_config.get("debiased", False) + method = "Iterative GLS" if iter_count > 1 else "GLS" for i in range(k): cons = int(self.has_constant.iloc[i]) @@ -1089,25 +1277,50 @@ def _gls_finalize(self, beta, sigma, full_sigma, gls_eps, eps, else: ye = self._wy[i] total_ss = float(ye.T @ ye) - stats = self._common_indiv_results(i, beta, cov, gls_eps, eps, - method, cov_type, cov_est, iter_count, - debiased, cons, total_ss) + stats = self._common_indiv_results( + i, + beta, + cov, + gls_eps, + eps, + method, + cov_type, + cov_est, + iter_count, + debiased, + cons, + total_ss, + ) key = self._eq_labels[i] individual[key] = stats # Populate results dictionary nobs = eps.size - results = self._common_results(beta, cov, method, iter_count, nobs, - cov_type, sigma, individual, debiased) + results = self._common_results( + beta, + cov, + method, + iter_count, + nobs, + cov_type, + est_sigma, + individual, + debiased, + ) # wresid is different between GLS and OLS wresid = [] for key in individual: wresid.append(individual[key].wresid) wresid = np.hstack(wresid) - results['wresid'] = wresid - results['cov_estimator'] = cov_est - results['cov_config'] = cov_est.cov_config + results["wresid"] = wresid + results["cov_estimator"] = cov_est + results["cov_config"] = cov_est.cov_config + individual = results["individual"] + r2s = [individual[eq].r2 for eq in individual] + results["system_r2"] = self._system_r2( + eps, sigma, "gls", full_cov, debiased, r2s + ) return SystemResults(results) @@ -1153,8 +1366,9 @@ def add_constraints(self, r, q=None): The property `param_names` can be used to determine the order of parameters. 
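A short sketch of the workflow described above, with synthetic data; `param_names` fixes the column order of `r`:

```python
# Sketch: impose a cross-equation equality restriction via add_constraints.
import numpy as np
import pandas as pd
from linearmodels.system import SUR

rs = np.random.RandomState(0)
x = pd.DataFrame({"const": np.ones(250), "x1": rs.standard_normal(250)})
mod = SUR({"first": (rs.standard_normal((250, 1)), x),
           "second": (rs.standard_normal((250, 1)), x)})
names = mod.param_names  # e.g. ['first_const', 'first_x1', 'second_const', 'second_x1']
r = pd.DataFrame([[0.0, 1.0, 0.0, -1.0]], columns=names)
mod.add_constraints(r)   # first_x1 - second_x1 = 0; q defaults to zeros
res = mod.fit(method="gls")
```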
""" - self._constraints = LinearConstraint(r, q=q, num_params=len(self._param_names), - require_pandas=True) + self._constraints = LinearConstraint( + r, q=q, num_params=len(self._param_names), require_pandas=True + ) def reset_constraints(self): """Remove all model constraints""" @@ -1185,7 +1399,7 @@ class SUR(IV3SLS): value must be either a tuple of the form (dependent, exog, [weights]) or a dictionary with keys 'dependent' and 'exog' and the optional key 'weights'. - sigma : array-like + sigma : array_like Prespecified residual covariance to use in GLS estimation. If not provided, FGLS is implemented based on an estimate of sigma. @@ -1251,10 +1465,10 @@ class SUR(IV3SLS): def __init__(self, equations, *, sigma=None): if not isinstance(equations, Mapping): - raise TypeError('equations must be a dictionary-like') + raise TypeError("equations must be a dictionary-like") for key in equations: if not isinstance(key, str): - raise ValueError('Equation labels (keys) must be strings') + raise ValueError("Equation labels (keys) must be strings") reformatted = equations.__class__() for key in equations: eqn = equations[key] @@ -1267,7 +1481,7 @@ def __init__(self, equations, *, sigma=None): eqn = eqn + (None, None) reformatted[key] = eqn super(SUR, self).__init__(reformatted, sigma=sigma) - self._model_name = 'Seemingly Unrelated Regression (SUR)' + self._model_name = "Seemingly Unrelated Regression (SUR)" @classmethod def multivariate_ls(cls, dependent, exog): @@ -1276,9 +1490,9 @@ def multivariate_ls(cls, dependent, exog): Parameters ---------- - dependent : array-like + dependent : array_like nobs by ndep array of dependent variables - exog : array-like + exog : array_like nobs by nvar array of exogenous regressors common to all models Returns @@ -1306,8 +1520,8 @@ def multivariate_ls(cls, dependent, exog): >>> mod = SUR.multivariate_ls(portfolios, factors) """ equations = OrderedDict() - dependent = IVData(dependent, var_name='dependent') - exog = IVData(exog, var_name='exog') + dependent = IVData(dependent, var_name="dependent") + exog = IVData(exog, var_name="exog") for col in dependent.pandas: equations[col] = (dependent.pandas[[col]], exog.pandas) return cls(equations) @@ -1325,7 +1539,7 @@ def from_formula(cls, formula, data, *, sigma=None, weights=None): description of the accepted syntax data : DataFrame Frame containing named variables - sigma : array-like + sigma : array_like Prespecified residual covariance to use in GLS estimation. If not provided, FGLS is implemented based on an estimate of sigma. weights : dict-like @@ -1388,7 +1602,7 @@ class IVSystemGMM(IV3SLS): 'exog'. The dictionary may contain optional keys for 'endog', 'instruments', and 'weights'. Endogenous and/or Instrument can be empty if all variables in an equation are exogenous. - sigma : array-like + sigma : array_like Prespecified residual covariance to use in GLS estimation. If not provided, FGLS is implemented based on an estimate of sigma. Only used if weight_type is 'unadjusted' @@ -1448,24 +1662,35 @@ class IVSystemGMM(IV3SLS): where :math:`W` is a positive definite weighting matrix. 
""" - def __init__(self, equations, *, sigma=None, weight_type='robust', **weight_config): + def __init__(self, equations, *, sigma=None, weight_type="robust", **weight_config): super().__init__(equations, sigma=sigma) self._weight_type = weight_type self._weight_config = weight_config if weight_type not in COV_TYPES: - raise ValueError('Unknown estimator for weight_type') + raise ValueError("Unknown estimator for weight_type") - if weight_type not in ('unadjusted', 'homoskedastic') and sigma is not None: + if weight_type not in ("unadjusted", "homoskedastic") and sigma is not None: import warnings - warnings.warn('sigma has been provided but the estimated weight ' - 'matrix not unadjusted (homoskedastic). sigma will ' - 'be ignored.', UserWarning) + + warnings.warn( + "sigma has been provided but the estimated weight " + "matrix not unadjusted (homoskedastic). sigma will " + "be ignored.", + UserWarning, + ) weight_type = COV_TYPES[weight_type] self._weight_est = GMM_W_EST[weight_type](**weight_config) - def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, - cov_type='robust', **cov_config): + def fit( + self, + *, + iter_limit=2, + tol=1e-6, + initial_weight=None, + cov_type="robust", + **cov_config + ): """ Estimate model parameters @@ -1496,7 +1721,7 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, Estimation results """ if cov_type not in COV_TYPES: - raise ValueError('Unknown cov_type: {0}'.format(cov_type)) + raise ValueError("Unknown cov_type: {0}".format(cov_type)) # Parameter estimation wx, wy, wz = self._wx, self._wy, self._wz k = len(wx) @@ -1506,12 +1731,14 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, w = blocked_inner_prod(wz, np.eye(k_total)) / nobs else: w = initial_weight - beta_last = beta = self._blocked_gmm(wx, wy, wz, w=w, constraints=self.constraints) + beta_last = beta = self._blocked_gmm( + wx, wy, wz, w=w, constraints=self.constraints + ) eps = [] loc = 0 for i in range(k): nb = wx[i].shape[1] - b = beta[loc:loc + nb] + b = beta[loc : loc + nb] eps.append(wy[i] - wx[i] @ b) loc += nb eps = np.hstack(eps) @@ -1520,7 +1747,9 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, iters = 1 norm = 10 * tol + 1 while iters < iter_limit and norm > tol: - sigma = self._weight_est.sigma(eps, wx) if self._sigma is None else self._sigma + sigma = ( + self._weight_est.sigma(eps, wx) if self._sigma is None else self._sigma + ) w = self._weight_est.weight_matrix(wx, wz, eps, sigma=sigma) beta = self._blocked_gmm(wx, wy, wz, w=w, constraints=self.constraints) delta = beta_last - beta @@ -1537,7 +1766,7 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, loc = 0 for i in range(k): nb = wx[i].shape[1] - b = beta[loc:loc + nb] + b = beta[loc : loc + nb] eps.append(wy[i] - wx[i] @ b) loc += nb eps = np.hstack(eps) @@ -1545,7 +1774,9 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, cov_type = COV_TYPES[cov_type] cov_est = GMM_COV_EST[cov_type] - cov = cov_est(wx, wz, eps, w, sigma=sigma, constraints=self._constraints, **cov_config) + cov = cov_est( + wx, wz, eps, w, sigma=sigma, constraints=self._constraints, **cov_config + ) weps = eps eps = [] @@ -1553,13 +1784,14 @@ def fit(self, *, iter_limit=2, tol=1e-6, initial_weight=None, x, y = self._x, self._y for i in range(k): nb = x[i].shape[1] - b = beta[loc:loc + nb] + b = beta[loc : loc + nb] eps.append(y[i] - x[i] @ b) loc += nb eps = np.hstack(eps) iters += 1 - return self._finalize_results(beta, cov.cov, weps, eps, w, sigma, - iters - 1, cov_type, 
cov_config, cov) + return self._finalize_results( + beta, cov.cov, weps, eps, w, sigma, iters - 1, cov_type, cov_config, cov + ) @staticmethod def _blocked_gmm(x, y, z, *, w=None, constraints=None): @@ -1576,16 +1808,27 @@ def _blocked_gmm(x, y, z, *, w=None, constraints=None): return params - def _finalize_results(self, beta, cov, weps, eps, wmat, sigma, - iter_count, cov_type, cov_config, cov_est): + def _finalize_results( + self, + beta, + cov, + weps, + eps, + wmat, + sigma, + iter_count, + cov_type, + cov_config, + cov_est, + ): """Collect results to return after GLS estimation""" k = len(self._wy) # Repackage results for individual equations individual = AttrDict() - debiased = cov_config.get('debiased', False) - method = '{0}-Step System GMM'.format(iter_count) + debiased = cov_config.get("debiased", False) + method = "{0}-Step System GMM".format(iter_count) if iter_count > 2: - method = 'Iterative System GMM' + method = "Iterative System GMM" for i in range(k): cons = int(self.has_constant.iloc[i]) @@ -1595,37 +1838,53 @@ def _finalize_results(self, beta, cov, weps, eps, wmat, sigma, else: ye = self._wy[i] total_ss = float(ye.T @ ye) - stats = self._common_indiv_results(i, beta, cov, weps, eps, - method, cov_type, cov_est, - iter_count, debiased, cons, total_ss, - weight_est=self._weight_est) + stats = self._common_indiv_results( + i, + beta, + cov, + weps, + eps, + method, + cov_type, + cov_est, + iter_count, + debiased, + cons, + total_ss, + weight_est=self._weight_est, + ) key = self._eq_labels[i] individual[key] = stats # Populate results dictionary nobs = eps.size - results = self._common_results(beta, cov, method, iter_count, nobs, - cov_type, sigma, individual, debiased) + results = self._common_results( + beta, cov, method, iter_count, nobs, cov_type, sigma, individual, debiased + ) # wresid is different between GLS and OLS wresid = [] for key in individual: wresid.append(individual[key].wresid) wresid = np.hstack(wresid) - results['wresid'] = wresid - results['wmat'] = wmat - results['weight_type'] = self._weight_type - results['weight_config'] = self._weight_est.config - results['cov_estimator'] = cov_est - results['cov_config'] = cov_est.cov_config - results['weight_estimator'] = self._weight_est - results['j_stat'] = self._j_statistic(beta, wmat) + results["wresid"] = wresid + results["wmat"] = wmat + results["weight_type"] = self._weight_type + results["weight_config"] = self._weight_est.config + results["cov_estimator"] = cov_est + results["cov_config"] = cov_est.cov_config + results["weight_estimator"] = self._weight_est + results["j_stat"] = self._j_statistic(beta, wmat) + r2s = [individual[eq].r2 for eq in individual] + results["system_r2"] = self._system_r2(eps, sigma, "gls", False, debiased, r2s) return GMMSystemResults(results) @classmethod - def from_formula(cls, formula, data, *, weights=None, weight_type='robust', **weight_config): + def from_formula( + cls, formula, data, *, weights=None, weight_type="robust", **weight_config + ): """ Specify a 3SLS using the formula interface @@ -1721,7 +1980,7 @@ def _j_statistic(self, params, weight_mat): idx = 0 for i in range(k): kx = x[i].shape[1] - beta = params[idx:idx + kx] + beta = params[idx : idx + kx] eps = y[i] - x[i] @ beta ze.append(z[i] * eps) idx += kx @@ -1729,7 +1988,7 @@ def _j_statistic(self, params, weight_mat): g_bar = ze.mean(0) nobs = x[0].shape[0] stat = float(nobs * g_bar.T @ np.linalg.inv(weight_mat) @ g_bar.T) - null = 'Expected moment conditions are equal to 0' + null = "Expected moment 
conditions are equal to 0" ninstr = sum(map(lambda a: a.shape[1], z)) nvar = sum(map(lambda a: a.shape[1], x)) ncons = 0 if self.constraints is None else self.constraints.r.shape[0] diff --git a/linearmodels/system/results.py b/linearmodels/system/results.py index b4724633b2..475c24b6b3 100644 --- a/linearmodels/system/results.py +++ b/linearmodels/system/results.py @@ -2,16 +2,16 @@ import datetime as dt -from property_cached import cached_property import numpy as np from pandas import DataFrame, Series, concat +from property_cached import cached_property from scipy import stats from statsmodels.iolib.summary import SimpleTable, fmt_2cols from linearmodels.utility import (AttrDict, _str, _SummaryStr, format_wide, param_table, pval_format) -__all__ = ['SystemResults', 'SystemEquationResult', 'GMMSystemResults'] +__all__ = ["SystemResults", "SystemEquationResult", "GMMSystemResults"] class _CommonResults(_SummaryStr): @@ -46,8 +46,7 @@ def method(self): @property def cov(self): """Estimated covariance of parameters""" - return DataFrame(self._cov, index=self._param_names, - columns=self._param_names) + return DataFrame(self._cov, index=self._param_names, columns=self._param_names) @property def cov_estimator(self): @@ -72,18 +71,18 @@ def debiased(self): @property def params(self): """Estimated parameters""" - return Series(self._params.squeeze(), index=self._param_names, name='params') + return Series(self._params.squeeze(), index=self._param_names, name="params") @property def std_errors(self): """Estimated parameter standard errors""" std_errors = np.sqrt(np.diag(self.cov)) - return Series(std_errors, index=self._param_names, name='stderr') + return Series(std_errors, index=self._param_names, name="stderr") @property def tstats(self): """Parameter t-statistics""" - return Series(self.params / self.std_errors, name='tstat') + return Series(self.params / self.std_errors, name="tstat") @cached_property def pvalues(self): @@ -95,11 +94,33 @@ def pvalues(self): else: pvals = 2 - 2 * stats.norm.cdf(np.abs(self.tstats)) - return Series(pvals, index=self._param_names, name='pvalue') + return Series(pvals, index=self._param_names, name="pvalue") @property def rsquared(self): - """Coefficient of determination (R**2)""" + r""" + Coefficient of determination (R2) + + Returns + ------- + float + The coefficient of determinations. + + Notes + ----- + The overall R2 is similar to Judge's system R2 since no weighting is + used. These two only differ if one or more equations do not include + constants. It is defined as + + .. math:: + + 1 - \frac{\sum_i \sum_j \hat{\epsilon}_{ij}^2}{\sum_i \sum_j \hat{\eta}_{ij}^2} + + where :math:`\eta` is the residual from a regression on only a + constant. Note that if a constant is not present in an equation + then the term in the denominator is **not** demeaned so that + :math:`\hat{\eta}_{ij}=y_{ij}`. 
+ """ return self._r2 @property @@ -143,7 +164,7 @@ def conf_int(self, level=0.95): Returns ------- - ci : DataFrame + DataFrame Confidence interval of the form [lower, upper] for each parameters Notes @@ -157,7 +178,7 @@ def conf_int(self, level=0.95): q = stats.norm.ppf(ci_quantiles) q = q[None, :] ci = self.params[:, None] + self.std_errors[:, None] * q - return DataFrame(ci, index=self._param_names, columns=['lower', 'upper']) + return DataFrame(ci, index=self._param_names, columns=["lower", "upper"]) class SystemResults(_CommonResults): @@ -175,13 +196,14 @@ def __init__(self, results): self._individual = AttrDict() for key in results.individual: self._individual[key] = SystemEquationResult(results.individual[key]) + self._system_r2 = results.system_r2 self._sigma = results.sigma self._model = results.model self._constraints = results.constraints - self._num_constraints = 'None' + self._num_constraints = "None" if results.constraints is not None: self._num_constraints = str(results.constraints.r.shape[0]) - self._weight_estimtor = results.get('weight_estimator', None) + self._weight_estimtor = results.get("weight_estimator", None) @property def model(self): @@ -210,21 +232,33 @@ def fitted_values(self): def _out_of_sample(self, equations, data, missing, dataframe): if equations is not None and data is not None: - raise ValueError('Predictions can only be constructed using one ' - 'of eqns or data, but not both.') - pred = self.model.predict(self.params, equations=equations, data=data) # type: DataFrame + raise ValueError( + "Predictions can only be constructed using one " + "of eqns or data, but not both." + ) + pred = self.model.predict( + self.params, equations=equations, data=data + ) # type: DataFrame if not dataframe: pred = {col: pred[[col]] for col in pred} if not missing: for key in pred: pred[key] = pred[key].dropna() else: - pred = pred.dropna(how='all', axis=1) + pred = pred.dropna(how="all", axis=1) return pred - def predict(self, equations=None, *, data=None, fitted=True, - idiosyncratic=False, missing=False, dataframe=False): + def predict( + self, + equations=None, + *, + data=None, + fitted=True, + idiosyncratic=False, + missing=False, + dataframe=False + ): """ In- and out-of-sample predictions @@ -254,7 +288,7 @@ def predict(self, equations=None, *, data=None, fitted=True, Returns ------- - predictions : DataFrame, dict + predictions : {DataFrame, dict} DataFrame or dictionary containing selected outputs Notes @@ -278,14 +312,17 @@ def predict(self, equations=None, *, data=None, fitted=True, if equations is not None or data is not None: return self._out_of_sample(equations, data, missing, dataframe) if not (fitted or idiosyncratic): - raise ValueError('At least one output must be selected') + raise ValueError("At least one output must be selected") if dataframe: if fitted and not idiosyncratic: out = self.fitted_values elif idiosyncratic and not fitted: out = self.resids else: - out = {'fitted_values': self.fitted_values, 'idiosyncratic': self.resids} + out = { + "fitted_values": self.fitted_values, + "idiosyncratic": self.resids, + } else: out = {} for key in self.equation_labels: @@ -314,6 +351,73 @@ def sigma(self): """Estimated residual covariance""" return self._sigma + @property + def system_rsquared(self): + r""" + Alternative measure of system fit + + Returns + ------- + Series + The measures of overall system fit. + + Notes + ----- + McElroy's R2 is defined as + + .. math:: + + 1 - \frac{SSR_{\Omega}}{TSS_{\Omega}} + + where + + .. 
math:: + + SSR_{\Omega} = \hat{\epsilon}^\prime\hat{\Omega}^{-1}\hat{\epsilon} + + and + + .. math:: + + TSS_{\Omega} = \hat{\eta}^\prime\hat{\Omega}^{-1}\hat{\eta} + + where :math:`\eta` is the residual from a regression on only a constant. + + Judge's system R2 is defined as + + .. math:: + + 1 - \frac{\sum_i \sum_j \hat{\epsilon}_{ij}^2}{\sum_i \sum_j \hat{\eta}_{ij}^2} + + where :math:`\eta` is the residual from a regression on only a constant. + + Berndt's system R2 is defined as + + .. math:: + + 1 - \frac{|\hat{\Sigma}_\epsilon|}{|\hat{\Sigma}_\eta|} + + where :math:`\hat{\Sigma}_\epsilon` and :math:`\hat{\Sigma}_\eta` are the + estimated covariances of :math:`\epsilon` and :math:`\eta`, respectively. + + Dhrymes's system R2 is defined as a weighted average of the R2 of each + equation + + .. math:: + + \sum_i w_i R^2_i + + where the weight is + + .. math:: + + w_i = \frac{\hat{\Sigma}_{\eta}^{[ii]}}{\mathrm{tr}(\hat{\Sigma}_{\eta})} + + the ratio of the variance of the dependent variable in an equation to + the total variance of all dependent variables. + """ + return self._system_r2 + @property def summary(self): """:obj:`statsmodels.iolib.summary.Summary` : Summary table of model estimation results ``summary.as_html()`` and ``summary.as_latex()``. """ - title = 'System ' + self._method + ' Estimation Summary' - - top_left = [('Estimator:', self._method), - ('No. Equations.:', str(len(self.equation_labels))), - ('No. Observations:', str(self.resids.shape[0])), - ('Date:', self._datetime.strftime('%a, %b %d %Y')), - ('Time:', self._datetime.strftime('%H:%M:%S')), - ('', ''), - ('', '')] - - top_right = [('Overall R-squared:', _str(self.rsquared)), - ('Cov. Estimator:', self._cov_type), - ('Num. Constraints: ', self._num_constraints), - ('', ''), - ('', ''), - ('', ''), - ('', '')] + title = "System " + self._method + " Estimation Summary" + + top_left = [ + ("Estimator:", self._method), + ("No. Equations.:", str(len(self.equation_labels))), + ("No. Observations:", str(self.resids.shape[0])), + ("Date:", self._datetime.strftime("%a, %b %d %Y")), + ("Time:", self._datetime.strftime("%H:%M:%S")), + ("", ""), + ("", ""), + ] + + top_right = [ + ("Overall R-squared:", _str(self.rsquared)), + ("McElroy's R-squared:", _str(self.system_rsquared.mcelroy)), + ("Judge's (OLS) R-squared:", _str(self.system_rsquared.judge)), + ("Berndt's R-squared:", _str(self.system_rsquared.berndt)), + ("Dhrymes's R-squared:", _str(self.system_rsquared.dhrymes)), + ("Cov. Estimator:", self._cov_type), + ("Num. 
Constraints: ", self._num_constraints), + ] stubs = [] vals = [] @@ -352,9 +460,9 @@ def summary(self): # Top Table # Parameter table fmt = fmt_2cols - fmt['data_fmts'][1] = '%10s' + fmt["data_fmts"][1] = "%10s" - top_right = [('%-21s' % (' ' + k), v) for k, v in top_right] + top_right = [("%-21s" % (" " + k), v) for k, v in top_right] stubs = [] vals = [] for stub, val in top_right: @@ -367,20 +475,20 @@ def summary(self): last_row = i == (len(self.equation_labels) - 1) results = self.equations[eqlabel] dep_name = results.dependent - title = 'Equation: {0}, Dependent Variable: {1}'.format(eqlabel, dep_name) + title = "Equation: {0}, Dependent Variable: {1}".format(eqlabel, dep_name) pad_bottom = results.instruments is not None and not last_row smry.tables.append(param_table(results, title, pad_bottom=pad_bottom)) if results.instruments: formatted = format_wide(results.instruments, 80) if not last_row: - formatted.append([' ']) - smry.tables.append(SimpleTable(formatted, headers=['Instruments'])) - extra_text = ['Covariance Estimator:'] - for line in str(self._cov_estimator).split('\n'): + formatted.append([" "]) + smry.tables.append(SimpleTable(formatted, headers=["Instruments"])) + extra_text = ["Covariance Estimator:"] + for line in str(self._cov_estimator).split("\n"): extra_text.append(line) if self._weight_estimtor: - extra_text.append('Weight Estimator:') - for line in str(self._weight_estimtor).split('\n'): + extra_text.append("Weight Estimator:") + for line in str(self._weight_estimtor).split("\n"): extra_text.append(line) smry.add_extra_txt(extra_text) @@ -405,7 +513,7 @@ def __init__(self, results): self._r2a = results.r2a self._instruments = results.instruments self._endog = results.endog - self._weight_estimator = results.get('weight_estimator', None) + self._weight_estimator = results.get("weight_estimator", None) @property def equation_label(self): @@ -430,24 +538,27 @@ def summary(self): ``summary.as_html()`` and ``summary.as_latex()``. """ - title = self._method + ' Estimation Summary' - - top_left = [('Eq. Label:', self.equation_label), - ('Dep. Variable:', self.dependent), - ('Estimator:', self._method), - ('No. Observations:', self.nobs), - ('Date:', self._datetime.strftime('%a, %b %d %Y')), - ('Time:', self._datetime.strftime('%H:%M:%S')), - - ('', '')] - - top_right = [('R-squared:', _str(self.rsquared)), - ('Adj. R-squared:', _str(self.rsquared_adj)), - ('Cov. Estimator:', self._cov_type), - ('F-statistic:', _str(self.f_statistic.stat)), - ('P-value (F-stat)', pval_format(self.f_statistic.pval)), - ('Distribution:', str(self.f_statistic.dist_name)), - ('', '')] + title = self._method + " Estimation Summary" + + top_left = [ + ("Eq. Label:", self.equation_label), + ("Dep. Variable:", self.dependent), + ("Estimator:", self._method), + ("No. Observations:", self.nobs), + ("Date:", self._datetime.strftime("%a, %b %d %Y")), + ("Time:", self._datetime.strftime("%H:%M:%S")), + ("", ""), + ] + + top_right = [ + ("R-squared:", _str(self.rsquared)), + ("Adj. R-squared:", _str(self.rsquared_adj)), + ("Cov. 
Estimator:", self._cov_type), + ("F-statistic:", _str(self.f_statistic.stat)), + ("P-value (F-stat)", pval_format(self.f_statistic.pval)), + ("Distribution:", str(self.f_statistic.dist_name)), + ("", ""), + ] stubs = [] vals = [] @@ -461,9 +572,9 @@ def summary(self): # Top Table # Parameter table fmt = fmt_2cols - fmt['data_fmts'][1] = '%10s' + fmt["data_fmts"][1] = "%10s" - top_right = [('%-21s' % (' ' + k), v) for k, v in top_right] + top_right = [("%-21s" % (" " + k), v) for k, v in top_right] stubs = [] vals = [] for stub, val in top_right: @@ -471,22 +582,22 @@ def summary(self): vals.append([val]) table.extend_right(SimpleTable(vals, stubs=stubs)) smry.tables.append(table) - smry.tables.append(param_table(self, 'Parameter Estimates', pad_bottom=True)) + smry.tables.append(param_table(self, "Parameter Estimates", pad_bottom=True)) extra_text = [] instruments = self._instruments if instruments: endog = self._endog extra_text = [] - extra_text.append('Endogenous: ' + ', '.join(endog)) - extra_text.append('Instruments: ' + ', '.join(instruments)) + extra_text.append("Endogenous: " + ", ".join(endog)) + extra_text.append("Instruments: " + ", ".join(instruments)) - extra_text.append('Covariance Estimator:') - for line in str(self._cov_estimator).split('\n'): + extra_text.append("Covariance Estimator:") + for line in str(self._cov_estimator).split("\n"): extra_text.append(line) if self._weight_estimator: - extra_text.append('Weight Estimator:') - for line in str(self._weight_estimator).split('\n'): + extra_text.append("Weight Estimator:") + for line in str(self._weight_estimator).split("\n"): extra_text.append(line) smry.add_extra_txt(extra_text) @@ -499,7 +610,7 @@ def f_statistic(self): Returns ------- - f : WaldTestStatistic + WaldTestStatistic Test statistic for null all coefficients excluding constant terms are zero. 
@@ -518,17 +629,17 @@ def f_statistic(self): @property def resids(self): """Estimated residuals""" - return Series(self._resid.squeeze(), index=self._index, name='resid') + return Series(self._resid.squeeze(), index=self._index, name="resid") @property def wresids(self): """Weighted estimated residuals""" - return Series(self._wresid.squeeze(), index=self._index, name='wresid') + return Series(self._wresid.squeeze(), index=self._index, name="wresid") @property def fitted_values(self): """Fitted values""" - return Series(self._fitted.squeeze(), index=self._index, name='fitted_values') + return Series(self._fitted.squeeze(), index=self._index, name="fitted_values") @property def rsquared_adj(self): @@ -575,7 +686,7 @@ def j_stat(self): Returns ------- - j : WaldTestStatistic + WaldTestStatistic J statistic test of overidentifying restrictions Notes diff --git a/linearmodels/tests/asset_pricing/_utility.py b/linearmodels/tests/asset_pricing/_utility.py index 8856e3cc7c..b13733aa1e 100644 --- a/linearmodels/tests/asset_pricing/_utility.py +++ b/linearmodels/tests/asset_pricing/_utility.py @@ -4,8 +4,9 @@ from linearmodels.utility import AttrDict -def generate_data(nfactor=3, nportfolio=25, nobs=1000, premia=None, output='pandas', - alpha=False): +def generate_data( + nfactor=3, nportfolio=25, nobs=1000, premia=None, output="pandas", alpha=False +): np.random.seed(12345) if premia is None: premia = np.arange(1, nfactor + 1) / (10 * nfactor) @@ -19,16 +20,12 @@ def generate_data(nfactor=3, nportfolio=25, nobs=1000, premia=None, output='pand portfolios = factors @ betas + idio if alpha: portfolios += np.arange(nportfolio)[None, :] / nportfolio / 100 - index = pd.date_range('1930-1-1', periods=nobs, freq='D') - if output == 'pandas': - cols = ['factor_{0}'.format(i) for i in range(1, nfactor + 1)] - factors = pd.DataFrame(factors, - columns=cols, - index=index) - cols = ['port_{0}'.format(i) for i in range(1, nportfolio + 1)] - portfolios = pd.DataFrame(portfolios, - columns=cols, - index=index) + index = pd.date_range("1930-1-1", periods=nobs, freq="D") + if output == "pandas": + cols = ["factor_{0}".format(i) for i in range(1, nfactor + 1)] + factors = pd.DataFrame(factors, columns=cols, index=index) + cols = ["port_{0}".format(i) for i in range(1, nportfolio + 1)] + portfolios = pd.DataFrame(portfolios, columns=cols, index=index) return AttrDict(factors=factors, portfolios=portfolios) @@ -36,7 +33,7 @@ def generate_data(nfactor=3, nportfolio=25, nobs=1000, premia=None, output='pand def get_all(res): attrs = dir(res) for attr_name in attrs: - if attr_name.startswith('_'): + if attr_name.startswith("_"): continue attr = getattr(res, attr_name) if callable(attr): diff --git a/linearmodels/tests/asset_pricing/test_covariance.py b/linearmodels/tests/asset_pricing/test_covariance.py index a8cc540394..74b49fb21a 100644 --- a/linearmodels/tests/asset_pricing/test_covariance.py +++ b/linearmodels/tests/asset_pricing/test_covariance.py @@ -14,17 +14,16 @@ def data(): moments = np.random.randn(500, 10) jacobian = np.random.rand(10, 8) jacobian_inv = np.eye(10) - return AttrDict(moments=moments, jacobian=jacobian, - inv_jacobian=jacobian_inv) + return AttrDict(moments=moments, jacobian=jacobian, inv_jacobian=jacobian_inv) def test_kernel_errors(data): with pytest.raises(ValueError): - KernelWeight(data.moments, kernel='unknown') + KernelWeight(data.moments, kernel="unknown") with pytest.raises(ValueError): - KernelWeight(data.moments, bandwidth=-.5) + KernelWeight(data.moments, bandwidth=-0.5) with 
pytest.raises(ValueError): - KernelCovariance(data.moments, jacobian=data.jacobian, kernel='unknown') + KernelCovariance(data.moments, jacobian=data.jacobian, kernel="unknown") with pytest.raises(ValueError): KernelCovariance(data.moments, jacobian=data.jacobian, bandwidth=-4) @@ -33,7 +32,9 @@ def test_no_jacobian(data): with pytest.raises(ValueError): KernelCovariance(data.moments) with pytest.raises(ValueError): - KernelCovariance(data.moments, jacobian=data.jacobian, inv_jacobian=data.inv_jacobian) + KernelCovariance( + data.moments, jacobian=data.jacobian, inv_jacobian=data.inv_jacobian + ) def test_alt_jacobians(data): diff --git a/linearmodels/tests/asset_pricing/test_formulas.py b/linearmodels/tests/asset_pricing/test_formulas.py index 1b56d8bd25..53597b4e94 100644 --- a/linearmodels/tests/asset_pricing/test_formulas.py +++ b/linearmodels/tests/asset_pricing/test_formulas.py @@ -9,28 +9,31 @@ TradedFactorModel) from linearmodels.tests.asset_pricing._utility import generate_data -FORMULA_FACTORS = 'factor_1 + factor_2 + factor_3' -FORMULA_PORT = 'port_1 + port_2 + port_3 + port_4 + port_5 + port_6 + port_7 + ' \ - 'port_8 + port_9 + port_10' -FORMULA = ' ~ '.join((FORMULA_PORT, FORMULA_FACTORS)) +FORMULA_FACTORS = "factor_1 + factor_2 + factor_3" +FORMULA_PORT = ( + "port_1 + port_2 + port_3 + port_4 + port_5 + port_6 + port_7 + " + "port_8 + port_9 + port_10" +) +FORMULA = " ~ ".join((FORMULA_PORT, FORMULA_FACTORS)) -@pytest.fixture(scope='module', params=[TradedFactorModel, LinearFactorModel, - LinearFactorModelGMM]) +@pytest.fixture( + scope="module", params=[TradedFactorModel, LinearFactorModel, LinearFactorModelGMM] +) def model(request): return request.param -@pytest.fixture(scope='module', params=[LinearFactorModel, LinearFactorModelGMM]) +@pytest.fixture(scope="module", params=[LinearFactorModel, LinearFactorModelGMM]) def non_traded_model(request): return request.param -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): - premia = np.array([.1, .1, .1]) - out = generate_data(nportfolio=10, output='pandas', alpha=True, premia=premia) - out['joined'] = concat([out.factors, out.portfolios], 1) + premia = np.array([0.1, 0.1, 0.1]) + out = generate_data(nportfolio=10, output="pandas", alpha=True, premia=premia) + out["joined"] = concat([out.factors, out.portfolios], 1) return out @@ -68,8 +71,9 @@ def test_non_traded_risk_free(data, non_traded_model): assert mod1.formula == FORMULA assert mod2.formula is None - mod1 = non_traded_model.from_formula(FORMULA_FACTORS, data.joined, - portfolios=data.portfolios, risk_free=True) + mod1 = non_traded_model.from_formula( + FORMULA_FACTORS, data.joined, portfolios=data.portfolios, risk_free=True + ) mod2 = non_traded_model(data.portfolios, data.factors, risk_free=True) res1 = mod1.fit() res2 = mod2.fit() diff --git a/linearmodels/tests/asset_pricing/test_linear_factor_gmm.py b/linearmodels/tests/asset_pricing/test_linear_factor_gmm.py index eef9319eea..59d33204e0 100644 --- a/linearmodels/tests/asset_pricing/test_linear_factor_gmm.py +++ b/linearmodels/tests/asset_pricing/test_linear_factor_gmm.py @@ -6,17 +6,19 @@ from linearmodels.tests.asset_pricing._utility import generate_data, get_all -@pytest.fixture(params=['numpy', 'pandas']) +@pytest.fixture(params=["numpy", "pandas"]) def data(request): return generate_data(nportfolio=10, output=request.param) def test_linear_model_gmm_moments_jacobian(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='robust', disp=0, 
debiased=False) - params = np.r_[res.betas.values.ravel(), - res.risk_premia.values.ravel(), - mod.factors.ndarray.mean(0)] + res = mod.fit(cov_type="robust", disp=0, debiased=False) + params = np.r_[ + res.betas.values.ravel(), + res.risk_premia.values.ravel(), + mod.factors.ndarray.mean(0), + ] mod_mom = mod._moments(params[:, None], True) mom = [] @@ -29,7 +31,7 @@ def test_linear_model_gmm_moments_jacobian(data): x = f - mu + lam b = res.betas.values for i in range(p.shape[1]): - eps = p[:, i:(i + 1)] - x @ b[[i]].T + eps = p[:, i : (i + 1)] - x @ b[[i]].T for j in range(fc.shape[1]): mom.append(eps * fc[:, [j]]) mom.append(f - mu) @@ -39,19 +41,19 @@ def test_linear_model_gmm_moments_jacobian(data): jac = np.zeros((mom.shape[1], params.shape[0])) nport, nf = p.shape[1], f.shape[1] # 1,1 - jac[:(nport * (nf + 1)), :nport * nf] = np.kron(np.eye(nport), fc.T @ x / n) + jac[: (nport * (nf + 1)), : nport * nf] = np.kron(np.eye(nport), fc.T @ x / n) # 1, 2 col = [] for i in range(nport): col.append(fc.T @ np.ones((n, 1)) @ b[[i]] / n) col = np.vstack(tuple(col)) - jac[:(nport * (nf + 1)), nport * nf:nport * nf + nf] = col + jac[: (nport * (nf + 1)), nport * nf : nport * nf + nf] = col # 1, 3 col = [] for i in range(nport): col.append(-fc.T @ np.ones((n, 1)) @ b[[i]] / n) col = np.vstack(tuple(col)) - jac[:(nport * (nf + 1)), -nf:] = col + jac[: (nport * (nf + 1)), -nf:] = col # 2,2 jac[-nf:, -nf:] = np.eye(nf) @@ -69,13 +71,13 @@ def test_linear_model_gmm_moments_jacobian(data): def test_linear_model_gmm_smoke_iterate(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='robust', disp=5, steps=20) + res = mod.fit(cov_type="robust", disp=5, steps=20) get_all(res) def test_linear_model_gmm_smoke_risk_free(data): mod = LinearFactorModelGMM(data.portfolios, data.factors, risk_free=True) - res = mod.fit(cov_type='robust', disp=10) + res = mod.fit(cov_type="robust", disp=10) get_all(res) str(res._cov_est) res._cov_est.__repr__() @@ -84,7 +86,7 @@ def test_linear_model_gmm_smoke_risk_free(data): def test_linear_model_gmm_kernel_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='kernel', disp=10) + res = mod.fit(cov_type="kernel", disp=10) get_all(res) str(res._cov_est) res._cov_est.__repr__() @@ -93,11 +95,11 @@ def test_linear_model_gmm_kernel_smoke(data): def test_linear_model_gmm_kernel_bandwidth_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='kernel', bandwidth=10, disp=10) + res = mod.fit(cov_type="kernel", bandwidth=10, disp=10) get_all(res) def test_linear_model_gmm_cue_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors, risk_free=True) - res = mod.fit(cov_type='robust', disp=10, use_cue=True) + res = mod.fit(cov_type="robust", disp=10, use_cue=True) get_all(res) diff --git a/linearmodels/tests/asset_pricing/test_linear_factor_model.py b/linearmodels/tests/asset_pricing/test_linear_factor_model.py index 1170cb8139..975873bcf7 100644 --- a/linearmodels/tests/asset_pricing/test_linear_factor_model.py +++ b/linearmodels/tests/asset_pricing/test_linear_factor_model.py @@ -11,7 +11,7 @@ from linearmodels.tests.asset_pricing._utility import generate_data, get_all -@pytest.fixture(params=['numpy', 'pandas']) +@pytest.fixture(params=["numpy", "pandas"]) def data(request): return generate_data(nportfolio=10, output=request.param) @@ -64,7 +64,7 @@ def test_linear_model_parameters(data): block[j, k] = b[i][j] * lam[k - 1] if j + 1 == k: block[j, k] -= 
alphas[i] - jac[block1:block2, loc:loc + nf + 1] = block + jac[block1:block2, loc : loc + nf + 1] = block loc += nf + 1 # 2, 2 jac[block1:block2, block1:block2] = b.T @ b @@ -77,14 +77,16 @@ def test_linear_model_parameters(data): block[row, col] = lam[j - 1] col += 1 row += 1 - jac[-nport:, :(nport * (nf + 1))] = block + jac[-nport:, : (nport * (nf + 1))] = block # 3, 2 - jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf] = b + jac[-nport:, (nport * (nf + 1)) : (nport * (nf + 1)) + nf] = b # 3, 3: already done since eye mod_jac = mod._jacobian(b, lam, alphas) assert_allclose(mod_jac[:block1], jac[:block1]) assert_allclose(mod_jac[block1:block2, :block1], jac[block1:block2, :block1]) - assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2]) + assert_allclose( + mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2] + ) assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2, block2:]) assert_allclose(mod_jac[block2:], jac[block2:]) @@ -100,14 +102,16 @@ def test_linear_model_parameters(data): cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov) - acov = cov[:block1:(nf + 1), :block1:(nf + 1)] + acov = cov[: block1 : (nf + 1), : block1 : (nf + 1)] jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas) assert_allclose(res.j_statistic.stat, jstat) assert_allclose(res.j_statistic.pval, 1 - stats.chi2(nport - nf).cdf(jstat)) get_all(res) - res = LinearFactorModel(data.portfolios, data.factors).fit(cov_type='kernel', debiased=False) + res = LinearFactorModel(data.portfolios, data.factors).fit( + cov_type="kernel", debiased=False + ) std_mom = moments / moments.std(0)[None, :] mom = std_mom.sum(1) bw = kernel_optimal_bandwidth(mom) @@ -168,7 +172,7 @@ def test_linear_model_parameters_risk_free(data): block[j, k] = bc[i][j] * lam[k] if j == k: block[j, k] -= alphas[i] - jac[block1:block2, loc:loc + nf + 1] = block + jac[block1:block2, loc : loc + nf + 1] = block loc += nf + 1 # 2, 2 jac[block1:block2, block1:block2] = bc.T @ bc @@ -181,14 +185,16 @@ def test_linear_model_parameters_risk_free(data): block[row, col] = lam[j] col += 1 row += 1 - jac[-nport:, :(nport * (nf + 1))] = block + jac[-nport:, : (nport * (nf + 1))] = block # 3, 2 - jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf + 1] = bc + jac[-nport:, (nport * (nf + 1)) : (nport * (nf + 1)) + nf + 1] = bc # 3, 3: already done since eye mod_jac = mod._jacobian(bc, lam, alphas) assert_allclose(mod_jac[:block1], jac[:block1]) assert_allclose(mod_jac[block1:block2, :block1], jac[block1:block2, :block1]) - assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2]) + assert_allclose( + mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2] + ) assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2, block2:]) assert_allclose(mod_jac[block2:], jac[block2:]) @@ -204,11 +210,13 @@ def test_linear_model_parameters_risk_free(data): cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov) - acov = cov[:block1:(nf + 1), :block1:(nf + 1)] + acov = cov[: block1 : (nf + 1), : block1 : (nf + 1)] jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas) - assert_allclose(res.cov.values[:block1:(nf + 1), :block1:(nf + 1)], acov) + assert_allclose(res.cov.values[: block1 : (nf + 1), : block1 : (nf + 1)], acov) assert_allclose(res.j_statistic.stat, jstat, rtol=1e-1) - assert_allclose(res.j_statistic.pval, 1 - stats.chi2(nport - nf - 1).cdf(jstat), rtol=1e-2) + assert_allclose( + res.j_statistic.pval, 1 - stats.chi2(nport - nf - 
1).cdf(jstat), rtol=1e-2 + ) get_all(res) @@ -222,7 +230,7 @@ def test_linear_model_parameters_risk_free_gls(data): sigma_inv = np.linalg.inv(sigma) mod = LinearFactorModel(data.portfolios, data.factors, risk_free=True, sigma=sigma) - assert 'using GLS' in str(mod) + assert "using GLS" in str(mod) res = mod.fit() f = mod.factors.ndarray p = mod.portfolios.ndarray @@ -272,7 +280,7 @@ def test_linear_model_parameters_risk_free_gls(data): block[j, k] = bct[i][j] * lam[k] if j == k: block[j, k] -= at[i] - jac[block1:block2, loc:loc + nf + 1] = block + jac[block1:block2, loc : loc + nf + 1] = block loc += nf + 1 # 2, 2 jac[block1:block2, block1:block2] = bc.T @ sigma_inv @ bc @@ -285,14 +293,16 @@ def test_linear_model_parameters_risk_free_gls(data): block[row, col] = lam[j] col += 1 row += 1 - jac[-nport:, :(nport * (nf + 1))] = block + jac[-nport:, : (nport * (nf + 1))] = block # 3, 2 - jac[-nport:, (nport * (nf + 1)):(nport * (nf + 1)) + nf + 1] = bc + jac[-nport:, (nport * (nf + 1)) : (nport * (nf + 1)) + nf + 1] = bc # 3, 3: already done since eye mod_jac = mod._jacobian(bc, lam, alphas) assert_allclose(mod_jac[:block1], jac[:block1]) assert_allclose(mod_jac[block1:block2, :block1], jac[block1:block2, :block1]) - assert_allclose(mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2]) + assert_allclose( + mod_jac[block1:block2, block1:block2], jac[block1:block2, block1:block2] + ) assert_allclose(mod_jac[block1:block2, block2:], jac[block1:block2, block2:]) assert_allclose(mod_jac[block2:], jac[block2:]) @@ -308,16 +318,18 @@ def test_linear_model_parameters_risk_free_gls(data): cov = (cov + cov.T) / 2 assert_allclose(cov, res.cov) - acov = cov[:block1:(nf + 1), :block1:(nf + 1)] + acov = cov[: block1 : (nf + 1), : block1 : (nf + 1)] jstat = float(alphas.T @ np.linalg.pinv(acov) @ alphas) - assert_allclose(res.cov.values[:block1:(nf + 1), :block1:(nf + 1)], acov) + assert_allclose(res.cov.values[: block1 : (nf + 1), : block1 : (nf + 1)], acov) assert_allclose(res.j_statistic.stat, jstat, rtol=1e-1) - assert_allclose(res.j_statistic.pval, 1 - stats.chi2(nport - nf - 1).cdf(jstat), rtol=1e-2) + assert_allclose( + res.j_statistic.pval, 1 - stats.chi2(nport - nf - 1).cdf(jstat), rtol=1e-2 + ) get_all(res) -@pytest.mark.parametrize('output', ['numpy', 'pandas']) +@pytest.mark.parametrize("output", ["numpy", "pandas"]) def test_infeasible(output): data = generate_data(nfactor=10, nportfolio=20, nobs=10, output=output) with pytest.raises(ValueError): diff --git a/linearmodels/tests/asset_pricing/test_model.py b/linearmodels/tests/asset_pricing/test_model.py index becb322a4f..100dbcd873 100644 --- a/linearmodels/tests/asset_pricing/test_model.py +++ b/linearmodels/tests/asset_pricing/test_model.py @@ -12,47 +12,49 @@ from linearmodels.iv.model import _OLS from linearmodels.tests.asset_pricing._utility import generate_data, get_all -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) -@pytest.fixture(params=['numpy', 'pandas']) +@pytest.fixture(params=["numpy", "pandas"]) def data(request): return generate_data(nportfolio=10, output=request.param) def test_linear_model_gmm_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='robust', disp=5) + res = mod.fit(cov_type="robust", disp=5) get_all(res) def test_linear_model_gmm_smoke_iterate(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - 
res = mod.fit(cov_type='robust', disp=5, steps=20) + res = mod.fit(cov_type="robust", disp=5, steps=20) get_all(res) def test_linear_model_gmm_smoke_risk_free(data): mod = LinearFactorModelGMM(data.portfolios, data.factors, risk_free=True) - res = mod.fit(cov_type='robust', disp=10) + res = mod.fit(cov_type="robust", disp=10) get_all(res) def test_linear_model_gmm_kernel_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='kernel', disp=10) + res = mod.fit(cov_type="kernel", disp=10) get_all(res) def test_linear_model_gmm_kernel_bandwidth_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - res = mod.fit(cov_type='kernel', bandwidth=10, disp=10) + res = mod.fit(cov_type="kernel", bandwidth=10, disp=10) get_all(res) def test_linear_model_gmm_cue_smoke(data): mod = LinearFactorModelGMM(data.portfolios, data.factors, risk_free=True) - res = mod.fit(cov_type='robust', disp=10, use_cue=True) + res = mod.fit(cov_type="robust", disp=10, use_cue=True) get_all(res) @@ -71,16 +73,18 @@ def test_linear_model_time_series(data): loc = 0 for i in range(data.portfolios.shape[1]): if isinstance(data.portfolios, pd.DataFrame): - p = data.portfolios.iloc[:, i:(i + 1)] + p = data.portfolios.iloc[:, i : (i + 1)] else: - p = data.portfolios[:, i:(i + 1)] - ols_res = _OLS(p, factors).fit(cov_type='robust', debiased=True) + p = data.portfolios[:, i : (i + 1)] + ols_res = _OLS(p, factors).fit(cov_type="robust", debiased=True) all_params.extend(list(ols_res.params)) all_tstats.extend(list(ols_res.tstats)) - x[:, loc:(loc + nf + 1)] = factors - e[:, loc:(loc + nf + 1)] = ols_res.resids.values[:, None] + x[:, loc : (loc + nf + 1)] = factors + e[:, loc : (loc + nf + 1)] = ols_res.resids.values[:, None] loc += nf + 1 - cov = res.cov.values[(nf + 1) * i:(nf + 1) * (i + 1), (nf + 1) * i:(nf + 1) * (i + 1)] + cov = res.cov.values[ + (nf + 1) * i : (nf + 1) * (i + 1), (nf + 1) * i : (nf + 1) * (i + 1) + ] ols_cov = ols_res.cov.values assert_allclose(cov, ols_cov) @@ -102,7 +106,7 @@ def test_linear_model_time_series(data): assert_allclose(cov, res.cov.values) alphas = np.array(all_params)[0::nfp1][:, None] - alpha_cov = cov[0:(nfp1 * nport):nfp1, 0:(nfp1 * nport):nfp1] + alpha_cov = cov[0 : (nfp1 * nport) : nfp1, 0 : (nfp1 * nport) : nfp1] stat_direct = float(alphas.T @ np.linalg.inv(alpha_cov) @ alphas) assert_allclose(res.j_statistic.stat, stat_direct) assert_allclose(1.0 - stats.chi2.cdf(stat_direct, nport), res.j_statistic.pval) @@ -110,13 +114,13 @@ def test_linear_model_time_series(data): def test_linear_model_time_series_kernel_smoke(data): mod = TradedFactorModel(data.portfolios, data.factors) - mod.fit(cov_type='kernel') + mod.fit(cov_type="kernel") def test_linear_model_time_series_error(data): mod = TradedFactorModel(data.portfolios, data.factors) with pytest.raises(ValueError): - mod.fit(cov_type='unknown') + mod.fit(cov_type="unknown") def test_errors(data): @@ -126,26 +130,26 @@ def test_errors(data): p2 = p.copy() p3 = p.copy().iloc[:-1] p4 = p.copy() - p5 = p.copy().iloc[:f.shape[1] - 1, :1] - p4 = p4.iloc[:, :(f.shape[1] - 1)] - p2['dupe'] = p.iloc[:, 0] - p['const'] = 1.0 + p5 = p.copy().iloc[: f.shape[1] - 1, :1] + p4 = p4.iloc[:, : (f.shape[1] - 1)] + p2["dupe"] = p.iloc[:, 0] + p["const"] = 1.0 f5 = f.copy() - f5 = f5.iloc[:p5.shape[0]] + f5 = f5.iloc[: p5.shape[0]] f2 = f.copy() - f2['dupe'] = f.iloc[:, 0] - f['const'] = 1.0 + f2["dupe"] = f.iloc[:, 0] + f["const"] = 1.0 else: p2 = np.c_[p, p[:, [0]]] p3 = p.copy()[:-1] p4 = p.copy() - 
p5 = p.copy()[:f.shape[1] - 1, :1] - p4 = p4[:, :(f.shape[1] - 1)] + p5 = p.copy()[: f.shape[1] - 1, :1] + p4 = p4[:, : (f.shape[1] - 1)] p = np.c_[np.ones((p.shape[0], 1)), p] f5 = f.copy() - f5 = f5[:p5.shape[0]] + f5 = f5[: p5.shape[0]] f2 = np.c_[f, f[:, [0]]] f = np.c_[np.ones((f.shape[0], 1)), f] @@ -187,10 +191,10 @@ def test_drop_missing(data): def test_unknown_kernel(data): mod = LinearFactorModel(data.portfolios, data.factors) with pytest.raises(ValueError): - mod.fit(cov_type='unknown') + mod.fit(cov_type="unknown") mod = LinearFactorModelGMM(data.portfolios, data.factors) with pytest.raises(ValueError): - mod.fit(cov_type='unknown') + mod.fit(cov_type="unknown") def test_all_missing(): @@ -202,13 +206,13 @@ def test_all_missing(): def test_repr(data): mod = LinearFactorModelGMM(data.portfolios, data.factors) - assert 'LinearFactorModelGMM' in mod.__repr__() - assert str(data.portfolios.shape[1]) + ' test portfolios' in mod.__repr__() - assert str(data.factors.shape[1]) + ' factors' in mod.__repr__() + assert "LinearFactorModelGMM" in mod.__repr__() + assert str(data.portfolios.shape[1]) + " test portfolios" in mod.__repr__() + assert str(data.factors.shape[1]) + " factors" in mod.__repr__() mod = LinearFactorModel(data.portfolios, data.factors, risk_free=True) - assert 'LinearFactorModel' in mod.__repr__() - assert 'Estimated risk-free' in mod.__repr__() - assert 'True' in mod.__repr__() + assert "LinearFactorModel" in mod.__repr__() + assert "Estimated risk-free" in mod.__repr__() + assert "True" in mod.__repr__() mod = TradedFactorModel(data.portfolios, data.factors) - assert 'TradedFactorModel' in mod.__repr__() + assert "TradedFactorModel" in mod.__repr__() assert str(hex(id(mod))) in mod.__repr__() diff --git a/linearmodels/tests/datasets/test_datasets.py b/linearmodels/tests/datasets/test_datasets.py index d25011d42d..02a40442fb 100644 --- a/linearmodels/tests/datasets/test_datasets.py +++ b/linearmodels/tests/datasets/test_datasets.py @@ -5,9 +5,20 @@ fringe, jobtraining, meps, mroz, munnell, wage, wage_panel) -DATASETS = [birthweight, card, fertility, french, fringe, - jobtraining, meps, mroz, munnell, wage, wage_panel] -ids = list(map(lambda x: x.__name__.split('.')[-1], DATASETS)) +DATASETS = [ + birthweight, + card, + fertility, + french, + fringe, + jobtraining, + meps, + mroz, + munnell, + wage, + wage_panel, +] +ids = list(map(lambda x: x.__name__.split(".")[-1], DATASETS)) @pytest.fixture(params=DATASETS, ids=ids) diff --git a/linearmodels/tests/iv/_utility.py b/linearmodels/tests/iv/_utility.py index d75aaccf7b..3083627cc6 100644 --- a/linearmodels/tests/iv/_utility.py +++ b/linearmodels/tests/iv/_utility.py @@ -17,11 +17,11 @@ def generate_data(nkp=(1000, 5, 3)): v = np.random.multivariate_normal(np.zeros(r.shape[0]), r, n) x = v[:, :k] - z = v[:, 2:k + p] + z = v[:, 2 : k + p] e = v[:, [-1]] endog = x[:, :2] exog = x[:, 2:] - instr = z[:, k - 2:] + instr = z[:, k - 2 :] params = np.arange(1, k + 1) / k params = params[:, None] y = x @ params + e @@ -37,9 +37,27 @@ def generate_data(nkp=(1000, 5, 3)): xzizx = x.T @ z @ z.T @ x / nobs xzizx_inv = np.linalg.inv(xzizx) - return AttrDict(nobs=nobs, e=e, x=x, y=y, z=z, xhat=xhat, - params=params, s2=s2, s2_debiased=s2_debiased, - clusters=clusters, nvar=nvar, v=v, vinv=vinv, vk=vk, - i=np.eye(k + p - 2), kappa=kappa, - xzizx=xzizx, xzizx_inv=xzizx_inv, - dep=dep, exog=exog, endog=endog, instr=instr) + return AttrDict( + nobs=nobs, + e=e, + x=x, + y=y, + z=z, + xhat=xhat, + params=params, + s2=s2, + 
s2_debiased=s2_debiased, + clusters=clusters, + nvar=nvar, + v=v, + vinv=vinv, + vk=vk, + i=np.eye(k + p - 2), + kappa=kappa, + xzizx=xzizx, + xzizx_inv=xzizx_inv, + dep=dep, + exog=exog, + endog=endog, + instr=instr, + ) diff --git a/linearmodels/tests/iv/results/execute-stata-simulated-data.py b/linearmodels/tests/iv/results/execute-stata-simulated-data.py index 676638dbb9..928cbec108 100644 --- a/linearmodels/tests/iv/results/execute-stata-simulated-data.py +++ b/linearmodels/tests/iv/results/execute-stata-simulated-data.py @@ -3,48 +3,74 @@ from os.path import join import subprocess -STATA_PATH = join('C:\\', 'Program Files (x86)', 'Stata13', 'StataMP-64.exe') +STATA_PATH = join("C:\\", "Program Files (x86)", "Stata13", "StataMP-64.exe") -dtafile = join(os.getcwd(), 'simulated-data.dta') +dtafile = join(os.getcwd(), "simulated-data.dta") start = """ use {dtafile}, clear \n tsset time \n -""".format(dtafile=dtafile) +""".format( + dtafile=dtafile +) model = r""" ivregress {method} {depvar} {exog_var} /// ({endog_var} = {instr}) {weight_opt}, {variance_option} {other_option} """ -methods = ['2sls', 'liml', 'gmm'] -depvars = ['y_unadjusted', 'y_robust', 'y_clustered', 'y_kernel'] -variance_options = ['vce(unadjusted)', 'vce(robust)', 'vce(cluster cluster_id)', - 'vce(hac bartlett 12)'] +methods = ["2sls", "liml", "gmm"] +depvars = ["y_unadjusted", "y_robust", "y_clustered", "y_kernel"] +variance_options = [ + "vce(unadjusted)", + "vce(robust)", + "vce(cluster cluster_id)", + "vce(hac bartlett 12)", +] depvar_with_var = list(zip(depvars, variance_options)) -exog_vars = ['', 'x3 x4 x5'] -endog_vars = ['x1', 'x1 x2'] -instr = ['z1', 'z1 z2'] -other_options = ['', 'small', 'noconstant', 'small noconstant', 'small center', - 'center', 'center noconstant', 'small center noconstant'] -weight_options = [' ', ' [aweight=weights] '] -inputs = [methods, depvar_with_var, exog_vars, endog_vars, instr, other_options, weight_options] +exog_vars = ["", "x3 x4 x5"] +endog_vars = ["x1", "x1 x2"] +instr = ["z1", "z1 z2"] +other_options = [ + "", + "small", + "noconstant", + "small noconstant", + "small center", + "center", + "center noconstant", + "small center noconstant", +] +weight_options = [" ", " [aweight=weights] "] +inputs = [ + methods, + depvar_with_var, + exog_vars, + endog_vars, + instr, + other_options, + weight_options, +] configs = [] for val in product(*inputs): method, dvo, exog, endog, instr, other_opt, weight_opt = val depvar, var_opt = dvo - if (len(endog) > len(instr)) or (other_opt.find('center') >= 0 and method != 'gmm'): + if (len(endog) > len(instr)) or (other_opt.find("center") >= 0 and method != "gmm"): continue - if method == 'gmm': - var_opt = var_opt.replace('vce', 'wmatrix') - - configs.append({'method': method, - 'depvar': depvar, - 'exog_var': exog, - 'endog_var': endog, - 'instr': instr, - 'variance_option': var_opt, - 'other_option': other_opt, - 'weight_opt': weight_opt}) + if method == "gmm": + var_opt = var_opt.replace("vce", "wmatrix") + + configs.append( + { + "method": method, + "depvar": depvar, + "exog_var": exog, + "endog_var": endog, + "instr": instr, + "variance_option": var_opt, + "other_option": other_opt, + "weight_opt": weight_opt, + } + ) results = """ estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g))) """ @@ -65,8 +91,8 @@ estout matrix(W, fmt(%13.12g)) using {outfile}, append """ -m = '{method}-num_endog_{num_endog}-num_exog_{num_exog}-num_instr_{num_instr}' -m = m + '-weighted_{weighted}-{variance}-{other}' +m = 
"{method}-num_endog_{num_endog}-num_exog_{num_exog}-num_instr_{num_instr}" +m = m + "-weighted_{weighted}-{variance}-{other}" section_header = """ file open myfile using {outfile}, write append file write myfile _n _n "########## !""" @@ -75,45 +101,47 @@ file close myfile """ -outfile = os.path.join(os.getcwd(), 'stata-iv-simulated-results.txt') +outfile = os.path.join(os.getcwd(), "stata-iv-simulated-results.txt") if os.path.exists(outfile): os.unlink(outfile) def count_vars(v): - if v.strip() == '': + if v.strip() == "": return 0 v = v.strip() - while ' ' in v: - v = v.replace(' ', ' ') - return len(v.split(' ')) + while " " in v: + v = v.replace(" ", " ") + return len(v.split(" ")) -with open('simulated-results.do', 'w') as stata: +with open("simulated-results.do", "w") as stata: stata.write(start) for config in configs: - sec_header = {'method': config['method'], - 'num_endog': count_vars(config['endog_var']), - 'num_exog': count_vars(config['exog_var']), - 'num_instr': count_vars(config['instr']), - 'variance': config['variance_option'], - 'other': config['other_option'].replace(' ', '_'), - 'outfile': outfile, - 'weighted': 'aweight' in config['weight_opt']} + sec_header = { + "method": config["method"], + "num_endog": count_vars(config["endog_var"]), + "num_exog": count_vars(config["exog_var"]), + "num_instr": count_vars(config["instr"]), + "variance": config["variance_option"], + "other": config["other_option"].replace(" ", "_"), + "outfile": outfile, + "weighted": "aweight" in config["weight_opt"], + } stata.write(section_header.format(**sec_header)) stata.write(model.format(**config)) - small = config['other_option'].find('small') >= 0 - extra = ' J ' if config['method'] == 'gmm' else ' kappa ' - extra += ' F p ' if small else ' chi2 p ' + small = config["other_option"].find("small") >= 0 + extra = " J " if config["method"] == "gmm" else " kappa " + extra += " F p " if small else " chi2 p " stata.write(results.format(outfile=outfile, extra=extra)) - if config['method'] == 'gmm': + if config["method"] == "gmm": stata.write(gmm_extra.format(outfile=outfile)) - stata.write('\n') + stata.write("\n") -do_file = join(os.getcwd(), 'simulated-results.do') -cmd = [STATA_PATH, '/e', 'do', do_file] -print(' '.join(cmd)) +do_file = join(os.getcwd(), "simulated-results.do") +cmd = [STATA_PATH, "/e", "do", do_file] +print(" ".join(cmd)) subprocess.call(cmd) diff --git a/linearmodels/tests/iv/results/execute-stata.py b/linearmodels/tests/iv/results/execute-stata.py index cd2d39eafe..2b01f1e340 100644 --- a/linearmodels/tests/iv/results/execute-stata.py +++ b/linearmodels/tests/iv/results/execute-stata.py @@ -2,7 +2,7 @@ from os.path import join import subprocess -STATA_PATH = join('C:\\', 'Program Files (x86)', 'Stata13', 'StataMP-64.exe') +STATA_PATH = join("C:\\", "Program Files (x86)", "Stata13", "StataMP-64.exe") start = """ use http://www.stata-press.com/data/r13/hsng, clear \n @@ -33,36 +33,40 @@ file close myfile """ -methods = ['2sls', 'liml', 'gmm'] -outfile = os.path.join(os.getcwd(), 'stata-iv-housing-results.txt') +methods = ["2sls", "liml", "gmm"] +outfile = os.path.join(os.getcwd(), "stata-iv-housing-results.txt") if os.path.exists(outfile): os.unlink(outfile) -variance_options = [', vce(unadjusted)', ', vce(robust)', ', vce(cluster division)'] -descr = ['unadjusted', 'robust', 'cluster'] +variance_options = [", vce(unadjusted)", ", vce(robust)", ", vce(cluster division)"] +descr = ["unadjusted", "robust", "cluster"] -with open('temp.do', 'w') as stata: +with open("temp.do", "w") 
as stata: stata.write(start) for small in (True, False): for method in methods: for vo, desc in zip(variance_options, descr): - small_text = 'small' if small else 'asymptotic' - stata.write(section_header.format(outfile=outfile, method=method, desc=desc, - small=small_text)) - desc += '-small' if small else '' - vo += ' small' if small else '' + small_text = "small" if small else "asymptotic" + stata.write( + section_header.format( + outfile=outfile, method=method, desc=desc, small=small_text + ) + ) + desc += "-small" if small else "" + vo += " small" if small else "" of = outfile.format(method=method, descr=desc) - extra = ' J ' if method == 'gmm' else ' kappa ' - extra += ' F p ' if small else ' chi2 p ' - cmd = iv_tempplate.format(outfile=of, variance_option=vo, method=method, - extra=extra) - if 'gmm' in method: - cmd = cmd.replace('vce', 'wmatrix') + extra = " J " if method == "gmm" else " kappa " + extra += " F p " if small else " chi2 p " + cmd = iv_tempplate.format( + outfile=of, variance_option=vo, method=method, extra=extra + ) + if "gmm" in method: + cmd = cmd.replace("vce", "wmatrix") stata.write(cmd) - if 'gmm' in method: + if "gmm" in method: stata.write(gmm_extra.format(outfile=of)) - stata.write('\n') + stata.write("\n") -do_file = join(os.getcwd(), 'temp.do') -stata_cmd = [STATA_PATH, '/e', 'do', do_file] -print(' '.join(stata_cmd)) +do_file = join(os.getcwd(), "temp.do") +stata_cmd = [STATA_PATH, "/e", "do", do_file] +print(" ".join(stata_cmd)) subprocess.call(stata_cmd) diff --git a/linearmodels/tests/iv/results/read_stata_results.py b/linearmodels/tests/iv/results/read_stata_results.py index 535ee7c95a..d7572e0b7e 100644 --- a/linearmodels/tests/iv/results/read_stata_results.py +++ b/linearmodels/tests/iv/results/read_stata_results.py @@ -9,32 +9,32 @@ def repl_const(df): index = list(df.index) replace_cols = list(df.columns) == index for i, v in enumerate(index): - if v == '_cons': - index[i] = 'const' + if v == "_cons": + index[i] = "const" df.index = index if replace_cols: df.columns = index for c in df: - df[c] = pd.to_numeric(df[c], errors='coerce') + df[c] = pd.to_numeric(df[c], errors="coerce") return df def parse_file(name): blocks = defaultdict(list) - current_key = '' - with open(name, 'r') as stata: + current_key = "" + with open(name, "r") as stata: for line in stata: - if line.strip() == '': + if line.strip() == "": continue - if line.startswith('###'): - current_key = line.split('!')[1] + if line.startswith("###"): + current_key = line.split("!")[1] continue blocks[current_key].append(line) return blocks def parse_block(block): - block = [l.strip().split('\t') for l in block] + block = [l.strip().split("\t") for l in block] params = [] cov = [] weight_mat = [] @@ -44,33 +44,33 @@ def parse_block(block): if len(line) == 2: params.append(line) elif len(line) == 1: - if line[0].startswith('***'): + if line[0].startswith("***"): break try: float(line[0]) params[-1].append(line[0]) except ValueError: pass - params = pd.DataFrame(params, columns=['variable', 'params', 'tstats']) - params = repl_const(params.set_index('variable')) - stats = params.loc[params.tstats.isnull(), 'params'] + params = pd.DataFrame(params, columns=["variable", "params", "tstats"]) + params = repl_const(params.set_index("variable")) + stats = params.loc[params.tstats.isnull(), "params"] params = params.loc[params.tstats.notnull()] - for line in block[last + 2:]: - if len(line) == 1 and line[0].startswith('***'): + for line in block[last + 2 :]: + if len(line) == 1 and 
line[0].startswith("***"): break cov.append(line) - cov[0].insert(0, 'variable') + cov[0].insert(0, "variable") last += i + 2 cov = pd.DataFrame(cov[1:], columns=cov[0]) - cov = repl_const(cov.set_index('variable')) + cov = repl_const(cov.set_index("variable")) if len(block) > (last + 1): - weight_mat = block[last + 2:] - weight_mat[0].insert(0, 'variable') + weight_mat = block[last + 2 :] + weight_mat[0].insert(0, "variable") weight_mat = pd.DataFrame(weight_mat[1:], columns=weight_mat[0]) - weight_mat = repl_const(weight_mat.set_index('variable')) + weight_mat = repl_const(weight_mat.set_index("variable")) return AttrDict(params=params, cov=cov, weight_mat=weight_mat, stats=stats) @@ -78,21 +78,28 @@ def parse_block(block): def finalize(params, stats, cov, weight_mat): tstats = params.tstats params = params.params - out = AttrDict(params=params, tstats=tstats, stats=stats, cov=cov, weight_mat=weight_mat) + out = AttrDict( + params=params, tstats=tstats, stats=stats, cov=cov, weight_mat=weight_mat + ) for key in stats.index: out[key] = stats[key] - fixes = {'model_ss': 'mss', 'resid_ss': 'rss', 'rsquared': 'r2', 'rsquared_adj': 'r2_a'} + fixes = { + "model_ss": "mss", + "resid_ss": "rss", + "rsquared": "r2", + "rsquared_adj": "r2_a", + } for key in fixes: if fixes[key] in out: out[key] = out[fixes[key]] else: out[key] = None - if 'chi2' in out: - out['f_statistic'] = out['chi2'] - elif 'F' in out: - out['f_statistic'] = out['F'] + if "chi2" in out: + out["f_statistic"] = out["chi2"] + elif "F" in out: + out["f_statistic"] = out["F"] else: - out['f_statistic'] = None + out["f_statistic"] = None return out @@ -105,10 +112,10 @@ def process_results(filename): return blocks -if __name__ == '__main__': +if __name__ == "__main__": import os - blocks = parse_file(os.path.join(os.getcwd(), 'stata-iv-simulated-results.txt')) + blocks = parse_file(os.path.join(os.getcwd(), "stata-iv-simulated-results.txt")) for key in blocks: out = parse_block(blocks[key]) - finalize(out['params'], out['stats'], out['cov'], out['weight_mat']).keys() + finalize(out["params"], out["stats"], out["cov"], out["weight_mat"]).keys() diff --git a/linearmodels/tests/iv/results/simulated-test-data.py b/linearmodels/tests/iv/results/simulated-test-data.py index 729f5727da..e5eea4d5c8 100644 --- a/linearmodels/tests/iv/results/simulated-test-data.py +++ b/linearmodels/tests/iv/results/simulated-test-data.py @@ -24,13 +24,13 @@ k, p, n = 5, 2, 600 r = np.empty((k + p + 1, k + p + 1)) r[:, :] = 0.5 -r[p:k + p, -1] = r[-1, p:k + 1 + p] = 0 +r[p : k + p, -1] = r[-1, p : k + 1 + p] = 0 r[-1, -1] = 0.5 r += 0.5 * np.eye(k + p + 1) w = multivariate_normal(np.zeros(k + p + 1), r, n) x = w[:, :k] -z = w[:, k:k + p] +z = w[:, k : k + p] e = w[:, -1] x = add_constant(x) beta = np.arange(k + 1) / k @@ -52,7 +52,7 @@ r += 0.5 * np.eye(cluster_size) rsqrt = np.linalg.cholesky(r) for i in range(0, len(r), 5): - e[i:i + 5] = (rsqrt @ e[i:i + 5][:, None]).squeeze() + e[i : i + 5] = (rsqrt @ e[i : i + 5][:, None]).squeeze() e_cluster = e clusters = np.tile(np.arange(n // 5)[None, :], (5, 1)).T.ravel() @@ -70,10 +70,43 @@ weights = weights / weights.mean() time = np.arange(n) -data = np.c_[time, y_unadjusted, y_robust, y_clustered, y_kernel, x, z, e_homo, e_hetero, - e_cluster, e_autoc, clusters, weights] -data = pd.DataFrame(data, columns=['time', 'y_unadjusted', 'y_robust', 'y_clustered', - 'y_kernel', '_cons', 'x1', 'x2', 'x3', - 'x4', 'x5', 'z1', 'z2', 'e_homo', 'e_hetero', 'e_cluster', - 'e_autoc', 'cluster_id', 'weights']) 
-data.to_stata('simulated-data.dta') +data = np.c_[ + time, + y_unadjusted, + y_robust, + y_clustered, + y_kernel, + x, + z, + e_homo, + e_hetero, + e_cluster, + e_autoc, + clusters, + weights, +] +data = pd.DataFrame( + data, + columns=[ + "time", + "y_unadjusted", + "y_robust", + "y_clustered", + "y_kernel", + "_cons", + "x1", + "x2", + "x3", + "x4", + "x5", + "z1", + "z2", + "e_homo", + "e_hetero", + "e_cluster", + "e_autoc", + "cluster_id", + "weights", + ], +) +data.to_stata("simulated-data.dta") diff --git a/linearmodels/tests/iv/test_absorbing.py b/linearmodels/tests/iv/test_absorbing.py index 2af481a4bc..a720c03bc3 100644 --- a/linearmodels/tests/iv/test_absorbing.py +++ b/linearmodels/tests/iv/test_absorbing.py @@ -23,7 +23,9 @@ from linearmodels.utility import AttrDict, MissingValueWarning NOBS = 100 -pytestmark = pytest.mark.filterwarnings('ignore:the matrix subclass:PendingDeprecationWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore:the matrix subclass:PendingDeprecationWarning" +) class Hasher(object): @@ -31,9 +33,11 @@ class Hasher(object): def hash_func(self): try: import xxhash + return xxhash.xxh64() except ImportError: import hashlib + return hashlib.sha1() def single(self, value): @@ -45,7 +49,7 @@ def single(self, value): hasher = Hasher() -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def rs(request): return np.random.RandomState(12345678) @@ -64,21 +68,26 @@ def random_cont(size, rs=None): return pd.DataFrame(series) -@pytest.fixture(scope='module', params=[1, 2, 3]) +@pytest.fixture(scope="module", params=[1, 2, 3]) def cat(request): rs = np.random.RandomState(0) return pd.DataFrame( - {str(i): random_cat(4, NOBS, rs=rs) for i in range(request.param)}) + {str(i): random_cat(4, NOBS, rs=rs) for i in range(request.param)} + ) -@pytest.fixture(scope='module', params=[1, 2]) +@pytest.fixture(scope="module", params=[1, 2]) def cont(request): rs = np.random.RandomState(0) return pd.DataFrame( - {'cont' + str(i): pd.Series(rs.standard_normal(NOBS)) for i in range(request.param)}) + { + "cont" + str(i): pd.Series(rs.standard_normal(NOBS)) + for i in range(request.param) + } + ) -@pytest.fixture(scope='module', params=[True, False]) +@pytest.fixture(scope="module", params=[True, False]) def weights(request): if not request.param: return None @@ -86,7 +95,7 @@ def weights(request): return rs.chisquare(10, NOBS) / 10.0 -@pytest.fixture(scope='module', params=[0, 1, 2]) +@pytest.fixture(scope="module", params=[0, 1, 2]) def interact(request): if not request.param: return None @@ -99,8 +108,18 @@ def interact(request): return interactions -def generate_data(k=3, const=True, nfactors=1, factor_density=10, nobs=2000, cont_interactions=1, - format='interaction', singleton_interaction=False, weighted=False, ncont=0): +def generate_data( + k=3, + const=True, + nfactors=1, + factor_density=10, + nobs=2000, + cont_interactions=1, + format="interaction", + singleton_interaction=False, + weighted=False, + ncont=0, +): rs = np.random.RandomState(1234567890) density = [factor_density] * max(nfactors, cont_interactions) x = rs.standard_normal((nobs, k)) @@ -122,9 +141,11 @@ def generate_data(k=3, const=True, nfactors=1, factor_density=10, nobs=2000, con if factors: factors = pd.concat(factors, 1) - if format == 'interaction': + if format == "interaction": if nfactors and ncont: - factors = Interaction(factors.iloc[:, :nfactors], factors.iloc[:, nfactors:]) + factors = Interaction( + factors.iloc[:, :nfactors], factors.iloc[:, nfactors:] + ) elif nfactors: 
factors = Interaction(factors, None) else: @@ -138,10 +159,12 @@ def generate_data(k=3, const=True, nfactors=1, factor_density=10, nobs=2000, con fact = rs.randint(ncat, size=(nobs)) effects = rs.standard_normal(nobs) y += effects - df = pd.DataFrame(pd.Series(pd.Categorical(fact)), columns=['fact{0}'.format(i)]) - df_eff = pd.DataFrame(effects[:, None], columns=['effect_{0}'.format(i)]) + df = pd.DataFrame( + pd.Series(pd.Categorical(fact)), columns=["fact{0}".format(i)] + ) + df_eff = pd.DataFrame(effects[:, None], columns=["effect_{0}".format(i)]) interactions.append(Interaction(df, df_eff)) - if format == 'pandas': + if format == "pandas": for i, interact in enumerate(interactions): interactions[i] = pd.concat([interact.cat, interact.cont], 1) interactions = interactions if interactions else None @@ -152,7 +175,9 @@ def generate_data(k=3, const=True, nfactors=1, factor_density=10, nobs=2000, con else: weights = None - return AttrDict(y=y, x=x, absorb=factors, interactions=interactions, weights=weights) + return AttrDict( + y=y, x=x, absorb=factors, interactions=interactions, weights=weights + ) # Permutations, k in (0,3), const in (True,False), factors=(0,1,2), interactions in (0,1) @@ -160,55 +185,66 @@ def generate_data(k=3, const=True, nfactors=1, factor_density=10, nobs=2000, con # k=3, const=True, nfactors=1, factor_density=10, nobs=2000, cont_interactions=1, # format='interaction', singleton_interaction=False -configs = product([0, 3], # k - [False, True], # constant - [1, 2, 0], # factors - [10], # density - [2000], # nobs - [0, 1], # cont interactions - ['interaction', 'pandas'], # format - [False, True], # singleton - [False, True], # weighted - [0, 1] # ncont - ) +configs = product( + [0, 3], # k + [False, True], # constant + [1, 2, 0], # factors + [10], # density + [2000], # nobs + [0, 1], # cont interactions + ["interaction", "pandas"], # format + [False, True], # singleton + [False, True], # weighted + [0, 1], # ncont +) configs = [c for c in configs if (c[2] or c[5] or c[9])] -id_str = 'k: {0}, const: {1}, nfactors: {2}, density: {3}, nobs: {4}, ' \ - 'cont_interacts: {5}, format:{6}, singleton:{7}, weighted: {8}, ncont: {9}' +id_str = ( + "k: {0}, const: {1}, nfactors: {2}, density: {3}, nobs: {4}, " + "cont_interacts: {5}, format:{6}, singleton:{7}, weighted: {8}, ncont: {9}" +) ids = [id_str.format(*config) for config in configs] -@pytest.fixture(scope='module', params=configs, ids=ids) +@pytest.fixture(scope="module", params=configs, ids=ids) def data(request): return generate_data(*request.param) -configs_ols = product([0, 3], # k - [False, True], # constant - [1, 2, 0], # factors - [50], # density - [500], # nobs - [0, 1], # cont interactions - ['interaction'], # format - [False], # singleton - [False, True], # weighted - [0, 1] # ncont - ) +configs_ols = product( + [0, 3], # k + [False, True], # constant + [1, 2, 0], # factors + [50], # density + [500], # nobs + [0, 1], # cont interactions + ["interaction"], # format + [False], # singleton + [False, True], # weighted + [0, 1], # ncont +) configs_ols = [c for c in configs_ols if (c[0] or c[1])] -id_str = 'k: {0}, const: {1}, nfactors: {2}, density: {3}, nobs: {4}, ' \ - 'cont_interacts: {5}, format:{6}, singleton:{7}, weighted: {8}, ncont: {9}' +id_str = ( + "k: {0}, const: {1}, nfactors: {2}, density: {3}, nobs: {4}, " + "cont_interacts: {5}, format:{6}, singleton:{7}, weighted: {8}, ncont: {9}" +) ids_ols = [id_str.format(*config) for config in configs_ols] -@pytest.fixture(scope='module', params=configs_ols, 
ids=ids_ols) +@pytest.fixture(scope="module", params=configs_ols, ids=ids_ols) def ols_data(request): return generate_data(*request.param) def test_smoke(data): - mod = AbsorbingLS(data.y, data.x, absorb=data.absorb, interactions=data.interactions, - weights=data.weights) + mod = AbsorbingLS( + data.y, + data.x, + absorb=data.absorb, + interactions=data.interactions, + weights=data.weights, + ) res = mod.fit() assert isinstance(res.summary, Summary) assert isinstance(str(res.summary), str) @@ -216,29 +252,44 @@ def test_smoke(data): def test_absorbing_exceptions(rs): with pytest.raises(TypeError): - AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS, 2)), - absorb=rs.standard_normal((NOBS, 2))) + AbsorbingLS( + rs.standard_normal(NOBS), + rs.standard_normal((NOBS, 2)), + absorb=rs.standard_normal((NOBS, 2)), + ) with pytest.raises(ValueError): AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS - 1, 2))) with pytest.raises(ValueError): - AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS, 2)), - absorb=pd.DataFrame(rs.standard_normal((NOBS - 1, 1)))) + AbsorbingLS( + rs.standard_normal(NOBS), + rs.standard_normal((NOBS, 2)), + absorb=pd.DataFrame(rs.standard_normal((NOBS - 1, 1))), + ) with pytest.raises(ValueError): - AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS, 2)), - interactions=random_cat(10, NOBS - 1, frame=True, rs=rs)) - mod = AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS, 2)), - interactions=random_cat(10, NOBS, frame=True, rs=rs)) + AbsorbingLS( + rs.standard_normal(NOBS), + rs.standard_normal((NOBS, 2)), + interactions=random_cat(10, NOBS - 1, frame=True, rs=rs), + ) + mod = AbsorbingLS( + rs.standard_normal(NOBS), + rs.standard_normal((NOBS, 2)), + interactions=random_cat(10, NOBS, frame=True, rs=rs), + ) with pytest.raises(RuntimeError): mod.absorbed_dependent with pytest.raises(RuntimeError): mod.absorbed_exog with pytest.raises(TypeError): - AbsorbingLS(rs.standard_normal(NOBS), rs.standard_normal((NOBS, 2)), - interactions=rs.randint(0, 10, size=(NOBS, 2))) + AbsorbingLS( + rs.standard_normal(NOBS), + rs.standard_normal((NOBS, 2)), + interactions=rs.randint(0, 10, size=(NOBS, 2)), + ) def test_clear_cache(): - _VARIABLE_CACHE['key'] = 'value' + _VARIABLE_CACHE["key"] = "value" clear_cache() assert len(_VARIABLE_CACHE) == 0 @@ -248,15 +299,15 @@ def test_category_product(cat): if cat.shape[1] == 1: assert_series_equal(prod, cat.iloc[:, 0], check_names=False) else: - alt = cat.iloc[:, 0].astype('int64') + alt = cat.iloc[:, 0].astype("int64") for i in range(1, cat.shape[1]): - alt += 10 ** (4 * i) * cat.iloc[:, i].astype('int64') + alt += 10 ** (4 * i) * cat.iloc[:, i].astype("int64") alt = pd.Categorical(alt) alt = pd.Series(alt) - df = pd.DataFrame([prod.cat.codes, alt.cat.codes], index=['cat_prod', 'alt']).T - g = df.groupby('cat_prod').alt + df = pd.DataFrame([prod.cat.codes, alt.cat.codes], index=["cat_prod", "alt"]).T + g = df.groupby("cat_prod").alt assert (g.nunique() == 1).all() - g = df.groupby('alt').cat_prod + g = df.groupby("alt").cat_prod assert (g.nunique() == 1).all() @@ -291,7 +342,7 @@ def test_category_interaction(): def test_category_continuous_interaction(): c = pd.Series(pd.Categorical([0, 0, 0, 1, 1, 1])) - v = pd.Series(np.arange(6.)) + v = pd.Series(np.arange(6.0)) actual = category_continuous_interaction(c, v, precondition=False) expected = np.zeros((6, 2)) expected[:3, 0] = v[:3] @@ -307,7 +358,7 @@ def test_category_continuous_interaction(): def 
test_category_continuous_interaction_interwoven(): c = pd.Series(pd.Categorical([0, 1, 0, 1, 0, 1])) - v = pd.Series(np.arange(6.)) + v = pd.Series(np.arange(6.0)) actual = category_continuous_interaction(c, v, precondition=False) expected = np.zeros((6, 2)) expected[::2, 0] = v[::2] @@ -378,7 +429,9 @@ def test_interaction_cat_cont_convert(cat, cont): def test_absorbing_regressors(cat, cont, interact, weights): - areg = AbsorbingRegressor(cat=cat, cont=cont, interactions=interact, weights=weights) + areg = AbsorbingRegressor( + cat=cat, cont=cont, interactions=interact, weights=weights + ) rank = areg.approx_rank expected_rank = 0 @@ -393,9 +446,9 @@ def test_absorbing_regressors(cat, cont, interact, weights): interact_mat = inter.sparse expected_rank += interact_mat.shape[1] expected.append(interact_mat) - expected = sp.hstack(expected, format='csc') + expected = sp.hstack(expected, format="csc") if weights is not None: - expected = (sp.diags(np.sqrt(weights)).dot(expected)).asformat('csc') + expected = (sp.diags(np.sqrt(weights)).dot(expected)).asformat("csc") actual = areg.regressors assert expected.shape == actual.shape assert_array_equal(expected.indptr, actual.indptr) @@ -405,7 +458,9 @@ def test_absorbing_regressors(cat, cont, interact, weights): def test_absorbing_regressors_hash(cat, cont, interact, weights): - areg = AbsorbingRegressor(cat=cat, cont=cont, interactions=interact, weights=weights) + areg = AbsorbingRegressor( + cat=cat, cont=cont, interactions=interact, weights=weights + ) # Build hash hashes = [] for col in cat: @@ -429,8 +484,13 @@ def test_empty_absorbing_regressor(): def test_against_ols(ols_data): - mod = AbsorbingLS(ols_data.y, ols_data.x, absorb=ols_data.absorb, - interactions=ols_data.interactions, weights=ols_data.weights) + mod = AbsorbingLS( + ols_data.y, + ols_data.x, + absorb=ols_data.absorb, + interactions=ols_data.interactions, + weights=ols_data.weights, + ) res = mod.fit() absorb = [] has_dummy = False @@ -451,7 +511,7 @@ def test_against_ols(ols_data): else: root_w = np.sqrt(mod.weights.ndarray) wabsorb = annihilate(root_w * absorb, root_w) - absorb = (1. 
/ root_w) * wabsorb + absorb = (1.0 / root_w) * wabsorb rank = np.linalg.matrix_rank(absorb) if rank < absorb.shape[1]: a, b = np.linalg.eig(absorb.T @ absorb) @@ -467,9 +527,11 @@ def test_against_ols(ols_data): def test_cache(): - gen = generate_data(2, True, 2, format='pandas', ncont=0, cont_interactions=1) + gen = generate_data(2, True, 2, format="pandas", ncont=0, cont_interactions=1) first = len(_VARIABLE_CACHE) - mod = AbsorbingLS(gen.y, gen.x, absorb=gen.absorb.iloc[:, :1], interactions=gen.interactions) + mod = AbsorbingLS( + gen.y, gen.x, absorb=gen.absorb.iloc[:, :1], interactions=gen.interactions + ) mod.fit() second = len(_VARIABLE_CACHE) mod = AbsorbingLS(gen.y, gen.x, absorb=gen.absorb, interactions=gen.interactions) @@ -484,18 +546,27 @@ def test_cache(): def test_instrments(): - gen = generate_data(2, True, 2, format='pandas', ncont=0, cont_interactions=1) - mod = AbsorbingLS(gen.y, gen.x, absorb=gen.absorb.iloc[:, :1], interactions=gen.interactions) + gen = generate_data(2, True, 2, format="pandas", ncont=0, cont_interactions=1) + mod = AbsorbingLS( + gen.y, gen.x, absorb=gen.absorb.iloc[:, :1], interactions=gen.interactions + ) assert mod.instruments.shape[1] == 0 def assert_results_equal(o_res: OLSResults, a_res: AbsorbingLSResults, k: int = None): if k is None: k = a_res.params.shape[0] - attrs = [v for v in dir(o_res) if not v.startswith('_')] - callables = ['conf_int'] - skip = ['summary', 'test_linear_constraint', 'predict', 'model', 'f_statistic', 'wald_test', - 'method'] + attrs = [v for v in dir(o_res) if not v.startswith("_")] + callables = ["conf_int"] + skip = [ + "summary", + "test_linear_constraint", + "predict", + "model", + "f_statistic", + "wald_test", + "method", + ] for attr in attrs: if attr in skip: continue @@ -507,9 +578,9 @@ def assert_results_equal(o_res: OLSResults, a_res: AbsorbingLSResults, k: int = if isinstance(left, np.ndarray): raise NotImplementedError elif isinstance(left, pd.DataFrame): - if attr == 'conf_int': + if attr == "conf_int": left = left.iloc[:k] - elif attr == 'cov': + elif attr == "cov": left = left.iloc[:k, :k] assert_allclose(left, right, rtol=2e-4, atol=1e-6) elif isinstance(left, pd.Series): @@ -526,22 +597,22 @@ def assert_results_equal(o_res: OLSResults, a_res: AbsorbingLSResults, k: int = def test_center_cov_arg(): - gen = generate_data(2, True, 2, format='pandas', ncont=0, cont_interactions=1) + gen = generate_data(2, True, 2, format="pandas", ncont=0, cont_interactions=1) mod = AbsorbingLS(gen.y, gen.x, absorb=gen.absorb, interactions=gen.interactions) res = mod.fit(center=True) - assert 'center' not in res.cov_config + assert "center" not in res.cov_config def test_drop_missing(): - gen = generate_data(2, True, 2, format='pandas', ncont=0, cont_interactions=1) + gen = generate_data(2, True, 2, format="pandas", ncont=0, cont_interactions=1) gen.y[::53] = np.nan gen.x[::79] = np.nan with pytest.warns(MissingValueWarning): AbsorbingLS(gen.y, gen.x, absorb=gen.absorb, interactions=gen.interactions) - gen = generate_data(2, True, 2, format='pandas', ncont=0, cont_interactions=1) + gen = generate_data(2, True, 2, format="pandas", ncont=0, cont_interactions=1) for col in gen.absorb: - gen.absorb[col] = gen.absorb[col].astype('int64').astype('object') + gen.absorb[col] = gen.absorb[col].astype("int64").astype("object") col_iloc = gen.absorb.columns.get_loc(col) gen.absorb.iloc[::91, col_iloc] = np.nan gen.absorb[col] = pd.Categorical(to_numpy(gen.absorb[col])) diff --git a/linearmodels/tests/iv/test_against_stata.py 
b/linearmodels/tests/iv/test_against_stata.py index 96e04d5921..c357db9da2 100644 --- a/linearmodels/tests/iv/test_against_stata.py +++ b/linearmodels/tests/iv/test_against_stata.py @@ -10,46 +10,50 @@ from linearmodels.iv import IV2SLS, IVGMM, IVLIML from linearmodels.tests.iv.results.read_stata_results import process_results -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) CWD = os.path.split(os.path.abspath(__file__))[0] -HOUSING_DATA = pd.read_csv(os.path.join(CWD, 'results', 'housing.csv'), index_col=0) -HOUSING_DATA.region = HOUSING_DATA.region.astype('category') -HOUSING_DATA.state = HOUSING_DATA.state.astype('category') -HOUSING_DATA.division = HOUSING_DATA.division.astype('category') +HOUSING_DATA = pd.read_csv(os.path.join(CWD, "results", "housing.csv"), index_col=0) +HOUSING_DATA.region = HOUSING_DATA.region.astype("category") +HOUSING_DATA.state = HOUSING_DATA.state.astype("category") +HOUSING_DATA.division = HOUSING_DATA.division.astype("category") -SIMULATED_DATA = pd.read_stata(os.path.join(CWD, 'results', 'simulated-data.dta')) +SIMULATED_DATA = pd.read_stata(os.path.join(CWD, "results", "simulated-data.dta")) -filepath = os.path.join(CWD, 'results', 'stata-iv-housing-results.txt') +filepath = os.path.join(CWD, "results", "stata-iv-housing-results.txt") HOUSING_RESULTS = process_results(filepath) -filepath = os.path.join(CWD, 'results', 'stata-iv-simulated-results.txt') +filepath = os.path.join(CWD, "results", "stata-iv-simulated-results.txt") SIMULATED_RESULTS = process_results(filepath) -MODELS = {'2sls': IV2SLS, 'gmm': IVGMM, 'liml': IVLIML} -COV_OPTIONS = {'cluster': {'cov_type': 'clustered', 'clusters': HOUSING_DATA.division}, - 'robust': {'cov_type': 'robust'}, - 'unadjusted': {'cov_type': 'unadjusted'}, - 'bartlett_12': {'cov_type': 'kernel', 'kernel': 'bartlett', 'bandwidth': 12}} +MODELS = {"2sls": IV2SLS, "gmm": IVGMM, "liml": IVLIML} +COV_OPTIONS = { + "cluster": {"cov_type": "clustered", "clusters": HOUSING_DATA.division}, + "robust": {"cov_type": "robust"}, + "unadjusted": {"cov_type": "unadjusted"}, + "bartlett_12": {"cov_type": "kernel", "kernel": "bartlett", "bandwidth": 12}, +} -@pytest.fixture(params=list(HOUSING_RESULTS.keys()), scope='module') +@pytest.fixture(params=list(HOUSING_RESULTS.keys()), scope="module") def housing(request): result = HOUSING_RESULTS[request.param] - keys = request.param.split('-') + keys = request.param.split("-") mod = MODELS[keys[0]] data = HOUSING_DATA endog = data.rent exog = add_constant(data.pcturban) instd = data.hsngval - instr = data[['faminc', 'region']] + instr = data[["faminc", "region"]] cov_opts = deepcopy(COV_OPTIONS[keys[1]]) - cov_opts['debiased'] = keys[2] == 'small' - if keys[0] == 'gmm': + cov_opts["debiased"] = keys[2] == "small" + if keys[0] == "gmm": weight_opts = deepcopy(COV_OPTIONS[keys[1]]) - weight_opts['weight_type'] = weight_opts['cov_type'] - del weight_opts['cov_type'] + weight_opts["weight_type"] = weight_opts["cov_type"] + del weight_opts["cov_type"] else: weight_opts = {} @@ -96,50 +100,59 @@ def test_cov(self, housing): SIMULATED_COV_OPTIONS = { - 'vce(cluster cluster_id)': {'cov_type': 'clustered', 'clusters': SIMULATED_DATA.cluster_id}, - 'vce(robust)': {'cov_type': 'robust'}, - 'vce(unadjusted)': {'cov_type': 'unadjusted'}, - 'vce(hac bartlett 12)': {'cov_type': 'kernel', 'kernel': 'bartlett', 'bandwidth': 12}} + "vce(cluster cluster_id)": { + 
"cov_type": "clustered", + "clusters": SIMULATED_DATA.cluster_id, + }, + "vce(robust)": {"cov_type": "robust"}, + "vce(unadjusted)": {"cov_type": "unadjusted"}, + "vce(hac bartlett 12)": { + "cov_type": "kernel", + "kernel": "bartlett", + "bandwidth": 12, + }, +} def construct_model(key): - model, nendog, nexog, ninstr, weighted, var, other = key.split('-') - var = var.replace('wmatrix', 'vce') + model, nendog, nexog, ninstr, weighted, var, other = key.split("-") + var = var.replace("wmatrix", "vce") mod = MODELS[model] data = SIMULATED_DATA - endog = data[['x1', 'x2']] if '2' in nendog else data.x1 - exog = data[['x3', 'x4', 'x5']] if '3' in nexog else None - instr = data[['z1', 'z2']] if '2' in ninstr else data.z1 - deps = {'vce(unadjusted)': data.y_unadjusted, - 'vce(robust)': data.y_robust, - 'vce(cluster cluster_id)': data.y_clustered, - 'vce(hac bartlett 12)': data.y_kernel} + endog = data[["x1", "x2"]] if "2" in nendog else data.x1 + exog = data[["x3", "x4", "x5"]] if "3" in nexog else None + instr = data[["z1", "z2"]] if "2" in ninstr else data.z1 + deps = { + "vce(unadjusted)": data.y_unadjusted, + "vce(robust)": data.y_robust, + "vce(cluster cluster_id)": data.y_clustered, + "vce(hac bartlett 12)": data.y_kernel, + } dep = deps[var] - if 'noconstant' not in other: + if "noconstant" not in other: if exog is not None: exog = add_constant(exog) else: exog = add_constant(pd.DataFrame(np.empty((dep.shape[0], 0)))) cov_opts = deepcopy(SIMULATED_COV_OPTIONS[var]) - cov_opts['debiased'] = 'small' in other + cov_opts["debiased"] = "small" in other mod_options = {} - if 'True' in weighted: - mod_options['weights'] = data.weights - if model == 'gmm': + if "True" in weighted: + mod_options["weights"] = data.weights + if model == "gmm": mod_options.update(deepcopy(SIMULATED_COV_OPTIONS[var])) - mod_options['weight_type'] = mod_options['cov_type'] - del mod_options['cov_type'] - mod_options['center'] = 'center' in other + mod_options["weight_type"] = mod_options["cov_type"] + del mod_options["cov_type"] + mod_options["center"] = "center" in other model_result = mod(dep, exog, endog, instr, **mod_options).fit(**cov_opts) - if model == 'gmm' and 'True' in weighted: - pytest.skip('Weighted GMM differs slightly') + if model == "gmm" and "True" in weighted: + pytest.skip("Weighted GMM differs slightly") return model_result -@pytest.fixture(params=list(SIMULATED_RESULTS.keys()), - scope='module') +@pytest.fixture(params=list(SIMULATED_RESULTS.keys()), scope="module") def simulated(request): result = SIMULATED_RESULTS[request.param] model_result = construct_model(request.param) @@ -170,7 +183,7 @@ def test_residual_ss(self, simulated): def test_fstat(self, simulated): res, stata = simulated if stata.f_statistic is None: - pytest.skip('Comparison result not available') + pytest.skip("Comparison result not available") assert_allclose(res.f_statistic.stat, stata.f_statistic) def test_params(self, simulated): @@ -191,7 +204,9 @@ def test_cov(self, simulated): def test_weight_mat(self, simulated): res, stata = simulated - if not hasattr(stata, 'weight_mat') or not isinstance(stata.weight_mat, pd.DataFrame): + if not hasattr(stata, "weight_mat") or not isinstance( + stata.weight_mat, pd.DataFrame + ): return stata_weight_mat = stata.weight_mat.reindex_like(res.weight_matrix) stata_weight_mat = stata_weight_mat[res.weight_matrix.columns] @@ -199,12 +214,12 @@ def test_weight_mat(self, simulated): def test_j_stat(self, simulated): res, stata = simulated - if not hasattr(stata, 'J') or stata.J is None: + if 
not hasattr(stata, "J") or stata.J is None: return assert_allclose(res.j_stat.stat, stata.J, atol=1e-6, rtol=1e-4) def test_kappa(self, simulated): res, stata = simulated - if not hasattr(stata, 'kappa') or stata.kappa is None: + if not hasattr(stata, "kappa") or stata.kappa is None: return assert_allclose(res.kappa, stata.kappa, rtol=1e-4) diff --git a/linearmodels/tests/iv/test_covariance.py b/linearmodels/tests/iv/test_covariance.py index 494fcb791d..72b46e1c12 100644 --- a/linearmodels/tests/iv/test_covariance.py +++ b/linearmodels/tests/iv/test_covariance.py @@ -16,23 +16,22 @@ from linearmodels.utility import AttrDict -@pytest.fixture(params=['bartlett', 'qs', 'parzen'], scope='module') +@pytest.fixture(params=["bartlett", "qs", "parzen"], scope="module") def kernel(request): kernel_name = request.param - if kernel_name == 'bartlett': + if kernel_name == "bartlett": weight_func = kernel_weight_bartlett - alt_names = ['newey-west'] - elif kernel_name == 'parzen': + alt_names = ["newey-west"] + elif kernel_name == "parzen": weight_func = kernel_weight_parzen - alt_names = ['gallant'] + alt_names = ["gallant"] else: weight_func = kernel_weight_quadratic_spectral - alt_names = ['quadratic-spectral', 'andrews'] - return AttrDict(kernel=kernel_name, alt_names=alt_names, - weight=weight_func) + alt_names = ["quadratic-spectral", "andrews"] + return AttrDict(kernel=kernel_name, alt_names=alt_names, weight=weight_func) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): return generate_data() @@ -87,44 +86,46 @@ def test_asymptotic(self, data): xhat = data.xhat s2 = data.s2 assert c.debiased is False - assert c.config == {'debiased': False, 'kappa': 1} + assert c.config == {"debiased": False, "kappa": 1} assert_allclose(c.s2, data.s2) assert_allclose(c.cov, data.s2 * inv(xhat.T @ xhat / nobs) / nobs) assert_allclose(c.s, s2 * data.v) assert_allclose(c.s, s2 * (xhat.T @ xhat / nobs)) def test_debiased(self, data): - c = HomoskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=True) + c = HomoskedasticCovariance(data.x, data.y, data.z, data.params, debiased=True) assert c.debiased is True - assert c.config == {'debiased': True, 'kappa': 1} + assert c.config == {"debiased": True, "kappa": 1} assert_allclose(c.s2, data.s2_debiased) assert_allclose(c.s, data.s2_debiased * data.v) assert_allclose(c.cov, data.s2_debiased * data.vinv / data.nobs) s = str(c) - assert 'Kappa' not in s - assert 'Debiased: True' in s - assert 'id' in c.__repr__() + assert "Kappa" not in s + assert "Debiased: True" in s + assert "id" in c.__repr__() def test_kappa(self, data): - c = HomoskedasticCovariance(data.x, data.y, data.z, data.params, kappa=data.kappa) + c = HomoskedasticCovariance( + data.x, data.y, data.z, data.params, kappa=data.kappa + ) assert c.debiased is False - assert c.config == {'debiased': False, 'kappa': .99} + assert c.config == {"debiased": False, "kappa": 0.99} assert_allclose(c.s, data.s2 * data.vk) assert_allclose(c.cov, data.s2 * inv(data.vk) / data.nobs) s = str(c) - assert 'Debiased: False' in s - assert 'Kappa' in s + assert "Debiased: False" in s + assert "Kappa" in s def test_kappa_debiased(self, data): - c = HomoskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=True, kappa=data.kappa) + c = HomoskedasticCovariance( + data.x, data.y, data.z, data.params, debiased=True, kappa=data.kappa + ) assert c.debiased is True - assert c.config == {'debiased': True, 'kappa': data.kappa} + assert c.config == {"debiased": True, "kappa": 
data.kappa} assert_allclose(c.s, data.s2_debiased * data.vk) assert_allclose(c.cov, data.s2_debiased * inv(data.vk) / data.nobs) s = str(c) - assert 'Debiased: True' in s + assert "Debiased: True" in s def test_errors(self, data): with pytest.raises(ValueError): @@ -137,38 +138,42 @@ class TestHeteroskedasticCovariance(object): def test_asymptotic(self, data): c = HeteroskedasticCovariance(data.x, data.y, data.z, data.params) assert c.debiased is False - assert c.config == {'debiased': False, 'kappa': 1} + assert c.config == {"debiased": False, "kappa": 1} assert_allclose(c.s2, data.s2) xhat, eps, nobs = data.xhat, data.e, data.nobs assert_allclose(c.s, (xhat * eps).T @ (xhat * eps) / nobs) def test_debiased(self, data): - c = HeteroskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=True) + c = HeteroskedasticCovariance( + data.x, data.y, data.z, data.params, debiased=True + ) xhat, eps, nobs, nvar = data.xhat, data.e, data.nobs, data.nvar assert c.debiased is True - assert c.config == {'debiased': True, 'kappa': 1} + assert c.config == {"debiased": True, "kappa": 1} s = (xhat * eps).T @ (xhat * eps) / (nobs - nvar) assert_allclose(c.s, s) assert_allclose(c.cov, data.vinv @ s @ data.vinv / nobs) def test_kappa_debiased(self, data): - c = HeteroskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=True, kappa=.99) + c = HeteroskedasticCovariance( + data.x, data.y, data.z, data.params, debiased=True, kappa=0.99 + ) assert c.debiased is True - assert c.config == {'debiased': True, 'kappa': 0.99} - c2 = HeteroskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=True) + assert c.config == {"debiased": True, "kappa": 0.99} + c2 = HeteroskedasticCovariance( + data.x, data.y, data.z, data.params, debiased=True + ) assert_allclose(c.s, c2.s) assert c.s2 == c2.s2 vk_inv = inv(data.vk) assert_allclose(c.cov, vk_inv @ c.s @ vk_inv / data.nobs) def test_kappa(self, data): - c = HeteroskedasticCovariance(data.x, data.y, data.z, data.params, - debiased=False, kappa=.99) + c = HeteroskedasticCovariance( + data.x, data.y, data.z, data.params, debiased=False, kappa=0.99 + ) assert c.debiased is False - assert c.config == {'debiased': False, 'kappa': 0.99} + assert c.config == {"debiased": False, "kappa": 0.99} c2 = HeteroskedasticCovariance(data.x, data.y, data.z, data.params) assert_allclose(c.s, c2.s) assert c.s2 == c2.s2 @@ -178,12 +183,13 @@ def test_kappa(self, data): class TestClusteredCovariance(object): def test_asymptotic(self, data): - c = ClusteredCovariance(data.x, data.y, data.z, data.params, - clusters=data.clusters) + c = ClusteredCovariance( + data.x, data.y, data.z, data.params, clusters=data.clusters + ) assert c._kappa == 1 assert c.debiased is False - assert c.config['debiased'] is False - assert_equal(c.config['clusters'], data.clusters) + assert c.config["debiased"] is False + assert_equal(c.config["clusters"], data.clusters) assert_allclose(c.s2, data.s2) sums = np.zeros((len(np.unique(data.clusters)), data.nvar)) xe = data.xhat * data.e @@ -196,15 +202,16 @@ def test_asymptotic(self, data): assert_allclose(c.s, s) assert_allclose(c.cov, data.vinv @ s @ data.vinv / data.nobs) cs = str(c) - assert 'Debiased: False' in cs - assert 'Num Clusters: {0}'.format(len(sums)) in cs + assert "Debiased: False" in cs + assert "Num Clusters: {0}".format(len(sums)) in cs def test_debiased(self, data): - c = ClusteredCovariance(data.x, data.y, data.z, data.params, - debiased=True, clusters=data.clusters) + c = ClusteredCovariance( + data.x, data.y, 
data.z, data.params, debiased=True, clusters=data.clusters + ) assert c.debiased is True - assert c.config['debiased'] is True - assert_equal(c.config['clusters'], data.clusters) + assert c.config["debiased"] is True + assert_equal(c.config["clusters"], data.clusters) ngroups = len(np.unique(data.clusters)) sums = np.zeros((ngroups, data.nvar)) @@ -215,38 +222,43 @@ def test_debiased(self, data): for j in range(len(sums)): op += sums[[j]].T @ sums[[j]] # This is a strange choice - s = op / data.nobs * ((data.nobs - 1) / (data.nobs - data.nvar)) * ngroups / (ngroups - 1) + s = ( + op + / data.nobs + * ((data.nobs - 1) / (data.nobs - data.nvar)) + * ngroups + / (ngroups - 1) + ) assert_allclose(c.s, s) assert_allclose(c.cov, data.vinv @ s @ data.vinv / data.nobs) cs = str(c) - assert 'Debiased: True' in cs - assert 'Num Clusters: {0}'.format(len(sums)) in cs - assert 'id' in c.__repr__() + assert "Debiased: True" in cs + assert "Num Clusters: {0}".format(len(sums)) in cs + assert "id" in c.__repr__() def test_errors(self, data): with pytest.raises(ValueError): - ClusteredCovariance(data.x, data.y, data.z, data.params, - clusters=data.clusters[:10]) + ClusteredCovariance( + data.x, data.y, data.z, data.params, clusters=data.clusters[:10] + ) class TestKernelCovariance(object): def test_asymptotic(self, data, kernel): - c = KernelCovariance(data.x, data.y, data.z, data.params, - kernel=kernel.kernel) + c = KernelCovariance(data.x, data.y, data.z, data.params, kernel=kernel.kernel) cs = str(c) - assert '\nBandwidth' not in cs + assert "\nBandwidth" not in cs for name in kernel.alt_names: - c2 = KernelCovariance(data.x, data.y, data.z, data.params, - kernel=name) + c2 = KernelCovariance(data.x, data.y, data.z, data.params, kernel=name) assert_equal(c.cov, c2.cov) assert c.debiased is False - assert c.config['debiased'] is False - assert_equal(c.config['kernel'], kernel.kernel) + assert c.config["debiased"] is False + assert_equal(c.config["kernel"], kernel.kernel) assert_allclose(c.s2, data.s2) - bw = c.config['bandwidth'] + bw = c.config["bandwidth"] xe = data.xhat * data.e s = xe.T @ xe w = kernel.weight(bw, xe.shape[0] - 1) @@ -257,39 +269,43 @@ def test_asymptotic(self, data, kernel): assert_allclose(c.cov, data.vinv @ s @ data.vinv / data.nobs) cs = str(c) - assert 'Kappa' not in cs - assert 'Kernel: {0}'.format(kernel.kernel) in cs - assert 'Bandwidth: {0}'.format(bw) in cs + assert "Kappa" not in cs + assert "Kernel: {0}".format(kernel.kernel) in cs + assert "Bandwidth: {0}".format(bw) in cs def test_debiased(self, data, kernel): - c = KernelCovariance(data.x, data.y, data.z, data.params, - kernel=kernel.kernel, debiased=True) + c = KernelCovariance( + data.x, data.y, data.z, data.params, kernel=kernel.kernel, debiased=True + ) for name in kernel.alt_names: - c2 = KernelCovariance(data.x, data.y, data.z, data.params, - kernel=name, debiased=True) + c2 = KernelCovariance( + data.x, data.y, data.z, data.params, kernel=name, debiased=True + ) assert_equal(c.cov, c2.cov) assert c._kappa == 1 assert c.debiased is True - assert c.config['debiased'] is True - assert_equal(c.config['kernel'], kernel.kernel) + assert c.config["debiased"] is True + assert_equal(c.config["kernel"], kernel.kernel) assert_allclose(c.s2, data.s2_debiased) - c2 = KernelCovariance(data.x, data.y, data.z, data.params, - kernel=kernel.kernel, debiased=False) + c2 = KernelCovariance( + data.x, data.y, data.z, data.params, kernel=kernel.kernel, debiased=False + ) scale = data.nobs / (data.nobs - data.nvar) 
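# test_debiased above checks that the debiased kernel covariance equals the
# asymptotic one rescaled by nobs / (nobs - nvar). A self-contained numpy
# sketch of the HAC arithmetic these assertions encode, with Bartlett
# (Newey-West) weights written out inline; all names here are illustrative,
# not the library's:
import numpy as np

def hac_score_cov(xe, bw, nvar_adjust=0):
    # xe is the nobs x nvar matrix of scores (xhat * eps in the tests above)
    nobs = xe.shape[0]
    w = 1 - np.arange(bw + 1) / (bw + 1)  # Bartlett weights, w[0] == 1
    s = xe.T @ xe
    for i in range(1, len(w)):
        op = xe[i:].T @ xe[:-i]
        s += w[i] * (op + op.T)           # symmetrized lag-i autocovariance
    # debiased=True swaps the 1/nobs normalization for 1/(nobs - nvar)
    return s / (nobs - nvar_adjust)

rs = np.random.RandomState(0)
xe = rs.standard_normal((200, 3))
scale = 200 / (200 - 3)
assert np.allclose(hac_score_cov(xe, 12, nvar_adjust=3),
                   scale * hac_score_cov(xe, 12))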
assert_allclose(c.s, scale * c2.s) assert_allclose(c.cov, scale * c2.cov) cs = str(c) - assert 'Debiased: True' in cs - assert 'Kernel: {0}'.format(kernel.kernel) in cs - assert 'Bandwidth: {0}'.format(c.config['bandwidth']) in cs - assert 'id' in c.__repr__() + assert "Debiased: True" in cs + assert "Kernel: {0}".format(kernel.kernel) in cs + assert "Bandwidth: {0}".format(c.config["bandwidth"]) in cs + assert "id" in c.__repr__() def test_unknown_kernel(self, data, kernel): with pytest.raises(ValueError): - KernelCovariance(data.x, data.y, data.z, data.params, - kernel=kernel.kernel + '_unknown') + KernelCovariance( + data.x, data.y, data.z, data.params, kernel=kernel.kernel + "_unknown" + ) class TestAutomaticBandwidth(object): @@ -303,4 +319,4 @@ def test_smoke(self, data, kernel): def test_unknown_kernel(self, data, kernel): with pytest.raises(ValueError): - kernel_optimal_bandwidth(data.e, kernel.kernel + '_unknown') + kernel_optimal_bandwidth(data.e, kernel.kernel + "_unknown") diff --git a/linearmodels/tests/iv/test_data.py b/linearmodels/tests/iv/test_data.py index 27c74fc76e..c80e335b1e 100644 --- a/linearmodels/tests/iv/test_data.py +++ b/linearmodels/tests/iv/test_data.py @@ -21,7 +21,7 @@ def test_numpy_2d(self): x = np.empty((10, 2)) xdh = IVData(x) assert xdh.ndim == x.ndim - assert xdh.cols == ['x.0', 'x.1'] + assert xdh.cols == ["x.0", "x.1"] assert xdh.rows == list(np.arange(10)) assert_equal(xdh.ndarray, x) df = pd.DataFrame(x, columns=xdh.cols, index=xdh.rows) @@ -33,7 +33,7 @@ def test_numpy_1d(self): x = np.empty(10) xdh = IVData(x) assert xdh.ndim == 2 - assert xdh.cols == ['x'] + assert xdh.cols == ["x"] assert xdh.rows == list(np.arange(10)) assert_equal(xdh.ndarray, x[:, None]) df = pd.DataFrame(x[:, None], columns=xdh.cols, index=xdh.rows) @@ -42,8 +42,8 @@ def test_numpy_1d(self): def test_pandas_df_numeric(self): x = np.empty((10, 2)) - index = pd.date_range('2017-01-01', periods=10) - xdf = pd.DataFrame(x, columns=['a', 'b'], index=index) + index = pd.date_range("2017-01-01", periods=10) + xdf = pd.DataFrame(x, columns=["a", "b"], index=index) xdh = IVData(xdf) assert xdh.ndim == 2 assert xdh.cols == list(xdf.columns) @@ -55,8 +55,8 @@ def test_pandas_df_numeric(self): def test_pandas_series_numeric(self): x = np.empty(10) - index = pd.date_range('2017-01-01', periods=10) - xs = pd.Series(x, name='charlie', index=index) + index = pd.date_range("2017-01-01", periods=10) + xs = pd.Series(x, name="charlie", index=index) xdh = IVData(xs) assert xdh.ndim == 2 assert xdh.cols == [xs.name] @@ -66,46 +66,43 @@ def test_pandas_series_numeric(self): assert_frame_equal(xdh.pandas, df) assert xdh.shape == (10, 1) - @pytest.mark.skipif(MISSING_XARRAY, reason='xarray not installed') + @pytest.mark.skipif(MISSING_XARRAY, reason="xarray not installed") def test_xarray_1d(self): x_np = np.random.randn(10) x = xr.DataArray(x_np) - dh = IVData(x, 'some_variable') + dh = IVData(x, "some_variable") assert_equal(dh.ndarray, x_np[:, None]) assert dh.rows == list(np.arange(10)) - assert dh.cols == ['some_variable.0'] + assert dh.cols == ["some_variable.0"] expected = pd.DataFrame(x_np, columns=dh.cols, index=dh.rows) assert_frame_equal(expected, dh.pandas) - index = pd.date_range('2017-01-01', periods=10) - x = xr.DataArray(x_np, - [('time', index)]) - dh = IVData(x, 'some_variable') + index = pd.date_range("2017-01-01", periods=10) + x = xr.DataArray(x_np, [("time", index)]) + dh = IVData(x, "some_variable") assert_equal(dh.ndarray, x_np[:, None]) 
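# The test_data.py hunks in this region all pin down how IVData normalizes
# array-like inputs. A short sketch of the documented behavior, with the
# module path assumed from this test file's imports:
import numpy as np
from linearmodels.iv.data import IVData

dh = IVData(np.empty((10, 2)))
assert dh.cols == ["x.0", "x.1"]          # auto-generated column names
assert dh.rows == list(np.arange(10))     # default integer row labels
assert dh.shape == (10, 2)
one_d = IVData(np.empty(10))              # 1-d input is promoted to one column
assert one_d.pandas.shape == (10, 1)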
assert_series_equal(pd.Series(dh.rows), pd.Series(list(index))) - assert dh.cols == ['some_variable.0'] + assert dh.cols == ["some_variable.0"] expected = pd.DataFrame(x_np[:, None], columns=dh.cols, index=dh.rows) assert_frame_equal(expected, dh.pandas) - @pytest.mark.skipif(MISSING_XARRAY, reason='xarray not installed') + @pytest.mark.skipif(MISSING_XARRAY, reason="xarray not installed") def test_xarray_2d(self): x_np = np.random.randn(10, 2) x = xr.DataArray(x_np) dh = IVData(x) assert_equal(dh.ndarray, x_np) assert dh.rows == list(np.arange(10)) - assert dh.cols == ['x.0', 'x.1'] + assert dh.cols == ["x.0", "x.1"] expected = pd.DataFrame(x_np, columns=dh.cols, index=dh.rows) assert_frame_equal(expected, dh.pandas) - index = pd.date_range('2017-01-01', periods=10) - x = xr.DataArray(x_np, - [('time', index), - ('variables', ['apple', 'banana'])]) + index = pd.date_range("2017-01-01", periods=10) + x = xr.DataArray(x_np, [("time", index), ("variables", ["apple", "banana"])]) dh = IVData(x) assert_equal(dh.ndarray, x_np) assert_series_equal(pd.Series(dh.rows), pd.Series(list(index))) - assert dh.cols == ['apple', 'banana'] + assert dh.cols == ["apple", "banana"] expected = pd.DataFrame(x_np, columns=dh.cols, index=dh.rows) assert_frame_equal(expected, dh.pandas) @@ -115,6 +112,7 @@ def test_invalid_types(self): with pytest.raises(ValueError): IVData(np.empty((10, 2, 2))) with pytest.raises(TypeError): + class AnotherClass(object): @property def ndim(self): @@ -123,21 +121,22 @@ def ndim(self): IVData(AnotherClass()) def test_string_cat_equiv(self): - s1 = pd.Series(['a', 'b', 'a', 'b', 'c', 'd', 'a', 'b']) + s1 = pd.Series(["a", "b", "a", "b", "c", "d", "a", "b"]) s2 = pd.Series(np.arange(8.0)) - s3 = pd.Series(['apple', 'banana', 'apple', 'banana', - 'cherry', 'date', 'apple', 'banana']) - df = pd.DataFrame({'string': s1, 'number': s2, 'other_string': s3}) + s3 = pd.Series( + ["apple", "banana", "apple", "banana", "cherry", "date", "apple", "banana"] + ) + df = pd.DataFrame({"string": s1, "number": s2, "other_string": s3}) dh = IVData(df) df_cat = df.copy() - df_cat['string'] = df_cat['string'].astype('category') + df_cat["string"] = df_cat["string"].astype("category") dh_cat = IVData(df_cat) assert_frame_equal(dh.pandas, dh_cat.pandas) def test_existing_datahandler(self): x = np.empty((10, 2)) - index = pd.date_range('2017-01-01', periods=10) - xdf = pd.DataFrame(x, columns=['a', 'b'], index=index) + index = pd.date_range("2017-01-01", periods=10) + xdf = pd.DataFrame(x, columns=["a", "b"], index=index) xdh = IVData(xdf) xdh2 = IVData(xdh) assert xdh is not xdh2 @@ -148,57 +147,57 @@ def test_existing_datahandler(self): assert_frame_equal(xdh.pandas, xdh2.pandas) def test_categorical(self): - index = pd.date_range('2017-01-01', periods=10) - cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a']) + index = pd.date_range("2017-01-01", periods=10) + cat = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"]) num = np.empty(10) df = pd.DataFrame(OrderedDict(cat=cat, num=num), index=index) dh = IVData(df) assert dh.ndim == 2 assert dh.shape == (10, 3) - assert sorted(dh.cols) == sorted(['cat.b', 'cat.c', 'num']) + assert sorted(dh.cols) == sorted(["cat.b", "cat.c", "num"]) assert dh.rows == list(index) - assert_equal(dh.pandas['num'].values, num) - assert_equal(dh.pandas['cat.b'].values, (cat == 'b').astype(np.float)) - assert_equal(dh.pandas['cat.c'].values, (cat == 'c').astype(np.float)) + assert_equal(dh.pandas["num"].values, num) + 
assert_equal(dh.pandas["cat.b"].values, (cat == "b").astype(np.float)) + assert_equal(dh.pandas["cat.c"].values, (cat == "c").astype(np.float)) def test_categorical_series(self): - index = pd.date_range('2017-01-01', periods=10) - cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a']) - s = pd.Series(cat, name='cat', index=index) + index = pd.date_range("2017-01-01", periods=10) + cat = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"]) + s = pd.Series(cat, name="cat", index=index) dh = IVData(s) assert dh.ndim == 2 assert dh.shape == (10, 2) - assert sorted(dh.cols) == sorted(['cat.b', 'cat.c']) + assert sorted(dh.cols) == sorted(["cat.b", "cat.c"]) assert dh.rows == list(index) - assert_equal(dh.pandas['cat.b'].values, (cat == 'b').astype(np.float)) - assert_equal(dh.pandas['cat.c'].values, (cat == 'c').astype(np.float)) + assert_equal(dh.pandas["cat.b"].values, (cat == "b").astype(np.float)) + assert_equal(dh.pandas["cat.c"].values, (cat == "c").astype(np.float)) def test_categorical_no_conversion(self): - index = pd.date_range('2017-01-01', periods=10) - cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a']) - s = pd.Series(cat, index=index, name='cat') + index = pd.date_range("2017-01-01", periods=10) + cat = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"]) + s = pd.Series(cat, index=index, name="cat") dh = IVData(s, convert_dummies=False) assert dh.ndim == 2 assert dh.shape == (10, 1) - assert dh.cols == ['cat'] + assert dh.cols == ["cat"] assert dh.rows == list(index) df = pd.DataFrame(s) assert_frame_equal(dh.pandas, df) def test_categorical_keep_first(self): - index = pd.date_range('2017-01-01', periods=10) - cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a']) + index = pd.date_range("2017-01-01", periods=10) + cat = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"]) num = np.empty(10) df = pd.DataFrame(OrderedDict(cat=cat, num=num), index=index) dh = IVData(df, drop_first=False) assert dh.ndim == 2 assert dh.shape == (10, 4) - assert sorted(dh.cols) == sorted(['cat.a', 'cat.b', 'cat.c', 'num']) + assert sorted(dh.cols) == sorted(["cat.a", "cat.b", "cat.c", "num"]) assert dh.rows == list(index) - assert_equal(dh.pandas['num'].values, num) - assert_equal(dh.pandas['cat.a'].values, (cat == 'a').astype(np.float)) - assert_equal(dh.pandas['cat.b'].values, (cat == 'b').astype(np.float)) - assert_equal(dh.pandas['cat.c'].values, (cat == 'c').astype(np.float)) + assert_equal(dh.pandas["num"].values, num) + assert_equal(dh.pandas["cat.a"].values, (cat == "a").astype(np.float)) + assert_equal(dh.pandas["cat.b"].values, (cat == "b").astype(np.float)) + assert_equal(dh.pandas["cat.c"].values, (cat == "c").astype(np.float)) def test_nobs_missing_error(self): with pytest.raises(ValueError): @@ -210,12 +209,12 @@ def test_incorrect_nobs(self): IVData(x, nobs=100) def test_mixed_data(self): - s = pd.Series([1, 2, 'a', -3.0]) + s = pd.Series([1, 2, "a", -3.0]) with pytest.raises(ValueError): IVData(s) def test_duplicate_column_names(): - x = pd.DataFrame(np.ones((3, 2)), columns=['x', 'x']) + x = pd.DataFrame(np.ones((3, 2)), columns=["x", "x"]) with pytest.raises(ValueError): IVData(x) diff --git a/linearmodels/tests/iv/test_formulas.py b/linearmodels/tests/iv/test_formulas.py index 431909d20f..010ac36979 100644 --- a/linearmodels/tests/iv/test_formulas.py +++ b/linearmodels/tests/iv/test_formulas.py @@ -10,9 +10,12 @@ from linearmodels.iv import IV2SLS, IVGMM, IVGMMCUE, IVLIML 
-@pytest.fixture(scope='module', - params=list(zip([IV2SLS, IVLIML, IVGMMCUE, IVGMM], - [iv_2sls, iv_liml, iv_gmm_cue, iv_gmm]))) +@pytest.fixture( + scope="module", + params=list( + zip([IV2SLS, IVLIML, IVGMMCUE, IVGMM], [iv_2sls, iv_liml, iv_gmm_cue, iv_gmm]) + ), +) def model_and_func(request): return request.param @@ -21,16 +24,18 @@ def sigmoid(v): return np.exp(v) / (1 + np.exp(v)) -formulas = ['y ~ 1 + x3 + x4 + x5 + [x1 + x2 ~ z1 + z2 + z3]', - 'y ~ 1 + x3 + x4 + [x1 + x2 ~ z1 + z2 + z3] + x5'] +formulas = [ + "y ~ 1 + x3 + x4 + x5 + [x1 + x2 ~ z1 + z2 + z3]", + "y ~ 1 + x3 + x4 + [x1 + x2 ~ z1 + z2 + z3] + x5", +] -@pytest.fixture(scope='module', params=formulas) +@pytest.fixture(scope="module", params=formulas) def formula(request): return request.param -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): n, k, p = 1000, 5, 3 np.random.seed(12345) @@ -44,16 +49,16 @@ def data(): v = np.random.multivariate_normal(np.zeros(r.shape[0]), r, n) x = v[:, :k] - z = v[:, k:k + p] + z = v[:, k : k + p] e = v[:, [-1]] params = np.arange(1, k + 1) / k params = params[:, None] y = x @ params + e - cols = ['y'] + ['x' + str(i) for i in range(1, 6)] - cols += ['z' + str(i) for i in range(1, 4)] + cols = ["y"] + ["x" + str(i) for i in range(1, 6)] + cols += ["z" + str(i) for i in range(1, 4)] data = DataFrame(np.c_[y, x, z], columns=cols) - data['Intercept'] = 1.0 - data['weights'] = np.random.chisquare(10, size=data.shape[0]) / 10 + data["Intercept"] = 1.0 + data["weights"] = np.random.chisquare(10, size=data.shape[0]) / 10 return data @@ -61,9 +66,9 @@ def test_formula(data, model_and_func, formula): model, func = model_and_func mod = model.from_formula(formula, data) res = mod.fit() - exog = data[['Intercept', 'x3', 'x4', 'x5']] - endog = data[['x1', 'x2']] - instr = data[['z1', 'z2', 'z3']] + exog = data[["Intercept", "x3", "x4", "x5"]] + endog = data[["x1", "x2"]] + instr = data[["z1", "z2", "z3"]] res2 = model(data.y, exog, endog, instr).fit() assert res.rsquared == res2.rsquared assert mod.formula == formula @@ -78,9 +83,9 @@ def test_formula_weights(data, model_and_func, formula): model, func = model_and_func mod = model.from_formula(formula, data, weights=data.weights) res = mod.fit() - exog = data[['Intercept', 'x3', 'x4', 'x5']] - endog = data[['x1', 'x2']] - instr = data[['z1', 'z2', 'z3']] + exog = data[["Intercept", "x3", "x4", "x5"]] + endog = data[["x1", "x2"]] + instr = data[["z1", "z2", "z3"]] res2 = model(data.y, exog, endog, instr, weights=data.weights).fit() assert res.rsquared == res2.rsquared assert mod.formula == formula @@ -94,14 +99,14 @@ def test_formula_weights(data, model_and_func, formula): def test_formula_kernel(data, model_and_func, formula): model, func = model_and_func mod = model.from_formula(formula, data) - mod.fit(cov_type='kernel') - func(formula, data).fit(cov_type='kernel') + mod.fit(cov_type="kernel") + func(formula, data).fit(cov_type="kernel") def test_formula_ols(data, model_and_func): model, func = model_and_func - formula = 'y ~ 1 + x1 + x2 + x3 + x4 + x5' - exog = data[['Intercept', 'x1', 'x2', 'x3', 'x4', 'x5']] + formula = "y ~ 1 + x1 + x2 + x3 + x4 + x5" + exog = data[["Intercept", "x1", "x2", "x3", "x4", "x5"]] res2 = model(data.y, exog, None, None) res2 = res2.fit() res = model.from_formula(formula, data).fit() @@ -113,8 +118,8 @@ def test_formula_ols(data, model_and_func): def test_formula_ols_weights(data, model_and_func): model, func = model_and_func - formula = 'y ~ 1 + x1 + x2 + x3 + x4 + x5' - exog = 
data[['Intercept', 'x1', 'x2', 'x3', 'x4', 'x5']] + formula = "y ~ 1 + x1 + x2 + x3 + x4 + x5" + exog = data[["Intercept", "x1", "x2", "x3", "x4", "x5"]] res2 = model(data.y, exog, None, None, weights=data.weights) res2 = res2.fit() res = model.from_formula(formula, data, weights=data.weights).fit() @@ -126,7 +131,7 @@ def test_formula_ols_weights(data, model_and_func): def test_no_exog(data, model_and_func): model, func = model_and_func - formula = 'y ~ [x1 + x2 ~ z1 + z2 + z3]' + formula = "y ~ [x1 + x2 ~ z1 + z2 + z3]" mod = model.from_formula(formula, data) res = mod.fit() res2 = func(formula, data).fit() @@ -134,7 +139,7 @@ def test_no_exog(data, model_and_func): assert res.rsquared == res2.rsquared assert mod.formula == formula - mod2 = model(data.y, None, data[['x1', 'x2']], data[['z1', 'z2', 'z3']]) + mod2 = model(data.y, None, data[["x1", "x2"]], data[["z1", "z2", "z3"]]) res3 = mod2.fit() assert_allclose(res.rsquared, res3.rsquared) @@ -142,38 +147,38 @@ def test_no_exog(data, model_and_func): def test_invalid_formula(data, model_and_func): model, func = model_and_func - formula = 'y ~ 1 + x1 + x2 ~ x3 + [x4 x5 ~ z1 z2]' + formula = "y ~ 1 + x1 + x2 ~ x3 + [x4 x5 ~ z1 z2]" with pytest.raises(ValueError): model.from_formula(formula, data).fit() with pytest.raises(ValueError): func(formula, data).fit() - formula = 'y ~ 1 + x1 + x2 + x3 + x4 + x5 ~ z1 z2' + formula = "y ~ 1 + x1 + x2 + x3 + x4 + x5 ~ z1 z2" with pytest.raises(ValueError): model.from_formula(formula, data).fit() - formula = 'y y2 ~ 1 + x1 + x2 + x3 + [x4 + x5 ~ + z1 + z2]' + formula = "y y2 ~ 1 + x1 + x2 + x3 + [x4 + x5 ~ + z1 + z2]" with pytest.raises(ValueError): model.from_formula(formula, data).fit() - formula = 'y y2 ~ 1 + x1 + x2 + x3 [ + x4 + x5 ~ z1 + z2]' + formula = "y y2 ~ 1 + x1 + x2 + x3 [ + x4 + x5 ~ z1 + z2]" with pytest.raises(ValueError): model.from_formula(formula, data).fit() - formula = 'y y2 ~ 1 + x1 + x2 + x3 + [x4 + x5 ~ z1 + z2]' + formula = "y y2 ~ 1 + x1 + x2 + x3 + [x4 + x5 ~ z1 + z2]" with pytest.raises(SyntaxError): model.from_formula(formula, data).fit() def test_categorical(model_and_func): - formula = 'y ~ 1 + d + x1' + formula = "y ~ 1 + d + x1" y = np.random.randn(1000) x1 = np.random.randn(1000) d = np.random.randint(0, 4, 1000) d = Categorical(d) - data = DataFrame({'y': y, 'x1': x1, 'd': d}) - data['Intercept'] = 1.0 + data = DataFrame({"y": y, "x1": x1, "d": d}) + data["Intercept"] = 1.0 model, func = model_and_func mod = model.from_formula(formula, data) res3 = mod.fit() res2 = func(formula, data).fit() - res = model(data.y, data[['Intercept', 'x1', 'd']], None, None).fit() + res = model(data.y, data[["Intercept", "x1", "d"]], None, None).fit() assert_allclose(res.rsquared, res2.rsquared) assert_allclose(res2.rsquared, res3.rsquared) @@ -184,8 +189,8 @@ def test_predict_formula(data, model_and_func, formula): model, func = model_and_func mod = model.from_formula(formula, data) res = mod.fit() - exog = data[['Intercept', 'x3', 'x4', 'x5']] - endog = data[['x1', 'x2']] + exog = data[["Intercept", "x3", "x4", "x5"]] + endog = data[["x1", "x2"]] pred = res.predict(exog, endog) pred2 = res.predict(data=data) assert_frame_equal(pred, pred2) @@ -194,16 +199,20 @@ def test_predict_formula(data, model_and_func, formula): def test_formula_function(data, model_and_func): model, func = model_and_func - fmla = 'y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)' + fmla = "y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)" mod = model.from_formula(fmla, data) res 
= mod.fit() dep = data.y - exog = [data[['Intercept']], sigmoid(data[['x3']]), data[['x4']], - np.exp(data[['x5']])] + exog = [ + data[["Intercept"]], + sigmoid(data[["x3"]]), + data[["x4"]], + np.exp(data[["x5"]]), + ] exog = concat(exog, 1) - endog = data[['x1', 'x2']] - instr = data[['z1', 'z2', 'z3']] + endog = data[["x1", "x2"]] + instr = data[["z1", "z2", "z3"]] mod = model(dep, exog, endog, instr) res2 = mod.fit() assert_equal(res.params.values, res2.params.values) @@ -216,14 +225,18 @@ def test_formula_function(data, model_and_func): def test_predict_formula_function(data, model_and_func): model, func = model_and_func - fmla = 'y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)' + fmla = "y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)" mod = model.from_formula(fmla, data) res = mod.fit() - exog = [data[['Intercept']], sigmoid(data[['x3']]), data[['x4']], - np.exp(data[['x5']])] + exog = [ + data[["Intercept"]], + sigmoid(data[["x3"]]), + data[["x4"]], + np.exp(data[["x5"]]), + ] exog = concat(exog, 1) - endog = data[['x1', 'x2']] + endog = data[["x1", "x2"]] pred = res.predict(exog, endog) pred2 = res.predict(data=data) assert_frame_equal(pred, pred2) @@ -240,8 +253,8 @@ def test_predict_formula_error(data, model_and_func, formula): model, func = model_and_func mod = model.from_formula(formula, data) res = mod.fit() - exog = data[['Intercept', 'x3', 'x4', 'x5']] - endog = data[['x1', 'x2']] + exog = data[["Intercept", "x3", "x4", "x5"]] + endog = data[["x1", "x2"]] with pytest.raises(ValueError): res.predict(exog, endog, data=data) with pytest.raises(ValueError): @@ -251,21 +264,21 @@ def test_predict_formula_error(data, model_and_func, formula): def test_single_character_names(data, model_and_func): # GH 149 data = data.copy() - data['x'] = data['x1'] - data['v'] = data['x2'] - data['z'] = data['z1'] - data['a'] = data['z2'] - fmla = 'y ~ 1 + [x ~ z]' + data["x"] = data["x1"] + data["v"] = data["x2"] + data["z"] = data["z1"] + data["a"] = data["z2"] + fmla = "y ~ 1 + [x ~ z]" model, func = model_and_func mod = model.from_formula(fmla, data) mod.fit() - fmla = 'y ~ 1 + [x ~ z + a]' + fmla = "y ~ 1 + [x ~ z + a]" model, func = model_and_func mod = model.from_formula(fmla, data) mod.fit() - fmla = 'y ~ 1 + [x + v ~ z + a]' + fmla = "y ~ 1 + [x + v ~ z + a]" model, func = model_and_func mod = model.from_formula(fmla, data) mod.fit() @@ -274,7 +287,7 @@ def test_single_character_names(data, model_and_func): def test_ols_formula(data): # GH 185 data = data.copy() - fmla = 'y ~ 1 + x1' + fmla = "y ~ 1 + x1" mod = IV2SLS.from_formula(fmla, data) res = mod.fit() - assert 'OLS Estimation Summary' in str(res) + assert "OLS Estimation Summary" in str(res) diff --git a/linearmodels/tests/iv/test_gmm.py b/linearmodels/tests/iv/test_gmm.py index 82b81c53f2..18e82b21f0 100644 --- a/linearmodels/tests/iv/test_gmm.py +++ b/linearmodels/tests/iv/test_gmm.py @@ -13,28 +13,27 @@ from linearmodels.utility import AttrDict -@pytest.fixture(params=[None, 12], scope='module') +@pytest.fixture(params=[None, 12], scope="module") def bandwidth(request): return request.param -@pytest.fixture(params=['bartlett', 'qs', 'parzen'], scope='module') +@pytest.fixture(params=["bartlett", "qs", "parzen"], scope="module") def kernel(request): kernel_name = request.param - if kernel_name == 'bartlett': + if kernel_name == "bartlett": weight_func = kernel_weight_bartlett - alt_names = ['newey-west'] - elif kernel_name == 'parzen': + alt_names = ["newey-west"] + elif kernel_name == "parzen": 
weight_func = kernel_weight_parzen - alt_names = ['gallant'] + alt_names = ["gallant"] else: weight_func = kernel_weight_quadratic_spectral - alt_names = ['quadratic-spectral', 'andrews'] - return AttrDict(kernel=kernel_name, alt_names=alt_names, - weight=weight_func) + alt_names = ["quadratic-spectral", "andrews"] + return AttrDict(kernel=kernel_name, alt_names=alt_names, weight=weight_func) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): return generate_data() @@ -68,8 +67,8 @@ def test_config(self, data): weight = wm.weight_matrix(data.x, z, e) s2 = (e - e.mean()).T @ (e - e.mean()) / nobs assert_allclose(weight, s2 * z.T @ z / nobs) - assert wm.config['center'] is False - assert wm.config['debiased'] is False + assert wm.config["center"] is False + assert wm.config["debiased"] is False class TestHeteroskedasticWeight(object): @@ -96,14 +95,13 @@ def test_config(self, data): ze = z * e assert_allclose(weight, ze.T @ ze / nobs) - assert wm.config['center'] is False - assert wm.config['debiased'] is False + assert wm.config["center"] is False + assert wm.config["debiased"] is False class TestKernelWeight(object): def test_center(self, data, kernel, bandwidth): - wm = KernelWeightMatrix(kernel.kernel, bandwidth, center=True, - optimal_bw=True) + wm = KernelWeightMatrix(kernel.kernel, bandwidth, center=True, optimal_bw=True) weight = wm.weight_matrix(data.x, data.z, data.e) z, e, nobs = data.z, data.e, data.nobs bw = bandwidth or wm.bandwidth @@ -115,15 +113,17 @@ def test_center(self, data, kernel, bandwidth): op = ze[i:].T @ ze[:-i] s += w[i] * (op + op.T) assert_allclose(weight, s / nobs) - assert wm.config['bandwidth'] == bw - assert wm.config['kernel'] == kernel.kernel + assert wm.config["bandwidth"] == bw + assert wm.config["kernel"] == kernel.kernel for name in kernel.alt_names: wm = KernelWeightMatrix(name, bandwidth, center=True, optimal_bw=True) weight2 = wm.weight_matrix(data.x, data.z, data.e) assert_equal(weight, weight2) def test_debiased(self, kernel, data, bandwidth): - wm = KernelWeightMatrix(debiased=True, kernel=kernel.kernel, bandwidth=bandwidth) + wm = KernelWeightMatrix( + debiased=True, kernel=kernel.kernel, bandwidth=bandwidth + ) weight = wm.weight_matrix(data.x, data.z, data.e) z, e, nobs, nvar = data.z, data.e, data.nobs, data.nvar bw = bandwidth or wm.bandwidth @@ -134,8 +134,8 @@ def test_debiased(self, kernel, data, bandwidth): op = ze[i:].T @ ze[:-i] s += w[i] * (op + op.T) assert_allclose(weight, s / (nobs - nvar)) - assert wm.config['bandwidth'] == bw - assert wm.config['kernel'] == kernel.kernel + assert wm.config["bandwidth"] == bw + assert wm.config["kernel"] == kernel.kernel def test_config(self, data, kernel, bandwidth): wm = KernelWeightMatrix(kernel=kernel.kernel, bandwidth=bandwidth) @@ -149,10 +149,10 @@ def test_config(self, data, kernel, bandwidth): op = ze[i:].T @ ze[:-i] s += w[i] * (op + op.T) assert_allclose(weight, s / nobs) - assert wm.config['center'] is False - assert wm.config['debiased'] is False - assert wm.config['bandwidth'] == bw - assert wm.config['kernel'] == kernel.kernel + assert wm.config["center"] is False + assert wm.config["debiased"] is False + assert wm.config["bandwidth"] == bw + assert wm.config["kernel"] == kernel.kernel for name in kernel.alt_names: wm = KernelWeightMatrix(kernel=name, bandwidth=bandwidth) @@ -191,9 +191,9 @@ def test_debiased(self, data): def test_config(self, data): wm = OneWayClusteredWeightMatrix(data.clusters) - assert wm.config['center'] is False - assert 
wm.config['debiased'] is False - assert_equal(wm.config['clusters'], data.clusters) + assert wm.config["center"] is False + assert wm.config["debiased"] is False + assert_equal(wm.config["clusters"], data.clusters) def test_errors(self, data): wm = OneWayClusteredWeightMatrix(data.clusters[:10]) @@ -203,57 +203,73 @@ def test_errors(self, data): class TestGMMCovariance(object): def test_homoskedastic(self, data): - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'unadjusted') + c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, "unadjusted") s = HomoskedasticWeightMatrix().weight_matrix(data.x, data.z, data.e) x, z = data.x, data.z xzwswzx = x.T @ z @ s @ z.T @ x / data.nobs cov = data.xzizx_inv @ xzwswzx @ data.xzizx_inv cov = (cov + cov.T) / 2 assert_allclose(c.cov, cov) - assert c.config['debiased'] is False + assert c.config["debiased"] is False def test_heteroskedastic(self, data): - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'robust') + c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, "robust") s = HeteroskedasticWeightMatrix().weight_matrix(data.x, data.z, data.e) x, z = data.x, data.z xzwswzx = x.T @ z @ s @ z.T @ x / data.nobs cov = data.xzizx_inv @ xzwswzx @ data.xzizx_inv cov = (cov + cov.T) / 2 assert_allclose(c.cov, cov) - assert c.config['debiased'] is False + assert c.config["debiased"] is False def test_clustered(self, data): - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'clustered', - clusters=data.clusters) - s = OneWayClusteredWeightMatrix(clusters=data.clusters).weight_matrix(data.x, data.z, - data.e) + c = IVGMMCovariance( + data.x, + data.y, + data.z, + data.params, + data.i, + "clustered", + clusters=data.clusters, + ) + s = OneWayClusteredWeightMatrix(clusters=data.clusters).weight_matrix( + data.x, data.z, data.e + ) x, z = data.x, data.z xzwswzx = x.T @ z @ s @ z.T @ x / data.nobs cov = data.xzizx_inv @ xzwswzx @ data.xzizx_inv cov = (cov + cov.T) / 2 assert_allclose(c.cov, cov) - assert c.config['debiased'] is False - assert_equal(c.config['clusters'], data.clusters) - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'clustered') - assert 'Clustered' in str(c) + assert c.config["debiased"] is False + assert_equal(c.config["clusters"], data.clusters) + c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, "clustered") + assert "Clustered" in str(c) def test_kernel(self, data, kernel, bandwidth): - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'kernel', - kernel=kernel.kernel, bandwidth=bandwidth) - s = KernelWeightMatrix(kernel=kernel.kernel, bandwidth=bandwidth).weight_matrix(data.x, - data.z, - data.e) + c = IVGMMCovariance( + data.x, + data.y, + data.z, + data.params, + data.i, + "kernel", + kernel=kernel.kernel, + bandwidth=bandwidth, + ) + s = KernelWeightMatrix(kernel=kernel.kernel, bandwidth=bandwidth).weight_matrix( + data.x, data.z, data.e + ) x, z, nobs = data.x, data.z, data.nobs xzwswzx = x.T @ z @ s @ z.T @ x / data.nobs cov = data.xzizx_inv @ xzwswzx @ data.xzizx_inv cov = (cov + cov.T) / 2 assert_allclose(c.cov, cov) - assert c.config['kernel'] == kernel.kernel - assert c.config['debiased'] is False - assert c.config['bandwidth'] == bandwidth or nobs - 2 - c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'kernel') - assert 'Kernel' in str(c) + assert c.config["kernel"] == kernel.kernel + assert c.config["debiased"] is False + assert c.config["bandwidth"] == bandwidth or nobs - 2 + c = 
+        c = IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, "kernel")
+        assert "Kernel" in str(c)
 
     def test_unknown(self, data):
         with pytest.raises(ValueError):
-            IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, 'unknown').cov
+            IVGMMCovariance(data.x, data.y, data.z, data.params, data.i, "unknown").cov
diff --git a/linearmodels/tests/iv/test_missing_data.py b/linearmodels/tests/iv/test_missing_data.py
index e5c4a0043d..b679758372 100644
--- a/linearmodels/tests/iv/test_missing_data.py
+++ b/linearmodels/tests/iv/test_missing_data.py
@@ -6,15 +6,17 @@
 from linearmodels.iv import IV2SLS, IVGMM, IVGMMCUE, IVLIML
 from linearmodels.utility import AttrDict
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 
-@pytest.fixture(scope='module', params=[IV2SLS, IVLIML, IVGMM, IVGMMCUE])
+@pytest.fixture(scope="module", params=[IV2SLS, IVLIML, IVGMM, IVGMMCUE])
 def model(request):
     return request.param
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def data():
     n, q, k, p = 1000, 2, 5, 3
     np.random.seed(12345)
@@ -29,7 +31,7 @@ def data():
     v = np.random.multivariate_normal(np.zeros(r.shape[0]), r, n)
     v.flat[::93] = np.nan
     x = v[:, :k]
-    z = v[:, k:k + p]
+    z = v[:, k : k + p]
     e = v[:, [-1]]
     params = np.arange(1, k + 1) / k
     params = params[:, None]
@@ -49,25 +51,32 @@ def data():
     endog_clean = x_clean[:, :q]
     instr_clean = z_clean
     clusters_clean = clusters[not_missing]
-    return AttrDict(dep=dep, exog=exog, endog=endog, instr=instr,
-                    dep_clean=dep_clean, exog_clean=exog_clean,
-                    endog_clean=endog_clean, instr_clean=instr_clean,
-                    clusters=clusters, clusters_clean=clusters_clean)
+    return AttrDict(
+        dep=dep,
+        exog=exog,
+        endog=endog,
+        instr=instr,
+        dep_clean=dep_clean,
+        exog_clean=exog_clean,
+        endog_clean=endog_clean,
+        instr_clean=instr_clean,
+        clusters=clusters,
+        clusters_clean=clusters_clean,
+    )
 
 
 def get_all(v):
-    attr = [d for d in dir(v) if not d.startswith('_')]
+    attr = [d for d in dir(v) if not d.startswith("_")]
     for a in attr:
         val = getattr(v, a)
-        if a in ('conf_int', 'durbin', 'wu_hausman', 'c_stat'):
+        if a in ("conf_int", "durbin", "wu_hausman", "c_stat"):
             val()
 
 
 def test_missing(data, model):
     mod = model(data.dep, data.exog, data.endog, data.instr)
     res = mod.fit()
-    mod = model(data.dep_clean, data.exog_clean,
-                data.endog_clean, data.instr_clean)
+    mod = model(data.dep_clean, data.exog_clean, data.endog_clean, data.instr_clean)
     res2 = mod.fit()
     assert res.nobs == res2.nobs
     assert_series_equal(res.params, res2.params)
@@ -77,11 +86,10 @@ def test_missing_clustered(data):
     mod = IV2SLS(data.dep, data.exog, data.endog, data.instr)
     with pytest.raises(ValueError):
-        mod.fit(cov_type='clustered', clusters=data.clusters)
-    res = mod.fit(cov_type='clustered', clusters=data.clusters_clean)
-    mod = IV2SLS(data.dep_clean, data.exog_clean,
-                 data.endog_clean, data.instr_clean)
-    res2 = mod.fit(cov_type='clustered', clusters=data.clusters_clean)
+        mod.fit(cov_type="clustered", clusters=data.clusters)
+    res = mod.fit(cov_type="clustered", clusters=data.clusters_clean)
+    mod = IV2SLS(data.dep_clean, data.exog_clean, data.endog_clean, data.instr_clean)
+    res2 = mod.fit(cov_type="clustered", clusters=data.clusters_clean)
     assert res.nobs == res2.nobs
     assert_series_equal(res.params, res2.params)
     get_all(res)
diff --git a/linearmodels/tests/iv/test_model.py b/linearmodels/tests/iv/test_model.py
index 1e98dcbf32..4ed0114b7f 100644
--- a/linearmodels/tests/iv/test_model.py
+++ b/linearmodels/tests/iv/test_model.py
@@ -16,7 +16,7 @@
 from linearmodels.utility import AttrDict
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def data():
     n, q, k, p = 1000, 2, 5, 3
     np.random.seed(12345)
@@ -30,7 +30,7 @@ def data():
     r += np.eye(9) * 0.5
     v = np.random.multivariate_normal(np.zeros(r.shape[0]), r, n)
     x = v[:, :k]
-    z = v[:, k:k + p]
+    z = v[:, k : k + p]
     e = v[:, [-1]]
     params = np.arange(1, k + 1) / k
     params = params[:, None]
@@ -43,18 +43,34 @@ def data():
     vinv = np.linalg.inv(v)
     kappa = 0.99
     vk = (x.T @ x * (1 - kappa) + kappa * xhat.T @ xhat) / nobs
-    return AttrDict(nobs=nobs, e=e, x=x, y=y, z=z, xhat=xhat,
-                    params=params, s2=s2, s2_debiased=s2_debiased,
-                    clusters=clusters, nvar=nvar, v=v, vinv=vinv, vk=vk,
-                    kappa=kappa, dep=y, exog=x[:, q:], endog=x[:, :q],
-                    instr=z)
+    return AttrDict(
+        nobs=nobs,
+        e=e,
+        x=x,
+        y=y,
+        z=z,
+        xhat=xhat,
+        params=params,
+        s2=s2,
+        s2_debiased=s2_debiased,
+        clusters=clusters,
+        nvar=nvar,
+        v=v,
+        vinv=vinv,
+        vk=vk,
+        kappa=kappa,
+        dep=y,
+        exog=x[:, q:],
+        endog=x[:, :q],
+        instr=z,
+    )
 
 
 def get_all(v):
-    attr = [d for d in dir(v) if not d.startswith('_')]
+    attr = [d for d in dir(v) if not d.startswith("_")]
     for a in attr:
         val = getattr(v, a)
-        if a in ('conf_int', 'durbin', 'wu_hausman', 'c_stat'):
+        if a in ("conf_int", "durbin", "wu_hausman", "c_stat"):
             val()
@@ -102,12 +118,14 @@ def test_kappa_fuller_warning(self, data):
 
     def test_string_cat(self, data):
         instr = data.instr.copy()
         n = data.instr.shape[0]
-        cat = pd.Series(['a'] * (n // 2) + ['b'] * (n // 2))
+        cat = pd.Series(["a"] * (n // 2) + ["b"] * (n // 2))
         instr = pd.DataFrame(instr)
-        instr['cat'] = cat
-        res = IV2SLS(data.dep, data.exog, data.endog, instr).fit(cov_type='unadjusted')
-        instr['cat'] = cat.astype('category')
-        res_cat = IV2SLS(data.dep, data.exog, data.endog, instr).fit(cov_type='unadjusted')
+        instr["cat"] = cat
+        res = IV2SLS(data.dep, data.exog, data.endog, instr).fit(cov_type="unadjusted")
+        instr["cat"] = cat.astype("category")
+        res_cat = IV2SLS(data.dep, data.exog, data.endog, instr).fit(
+            cov_type="unadjusted"
+        )
         assert_series_equal(res.params, res_cat.params)
 
     def test_no_regressors(self, data):
@@ -260,6 +278,7 @@ def test_model_summary_smoke(data):
 
 def test_model_missing(data):
     import copy
+
     data2 = AttrDict()
     for key in data:
         data2[key] = copy.deepcopy(data[key])
@@ -290,24 +309,20 @@ def test_compare(data):
     c = compare([res1, res2, res3, res4])
     assert len(c.rsquared) == 4
     c.summary
-    c = compare({'Model A': res1,
-                 'Model B': res2,
-                 'Model C': res3,
-                 'Model D': res4})
+    c = compare({"Model A": res1, "Model B": res2, "Model C": res3, "Model D": res4})
     c.summary
     res = OrderedDict()
-    res['Model A'] = res1
-    res['Model B'] = res2
-    res['Model C'] = res3
-    res['Model D'] = res4
+    res["Model A"] = res1
+    res["Model B"] = res2
+    res["Model C"] = res3
+    res["Model D"] = res4
     c = compare(res)
     c.summary
     c.pvalues
 
     res1 = IV2SLS(data.dep, data.exog[:, :1], None, None).fit()
     res2 = IV2SLS(data.dep, data.exog[:, :2], None, None).fit()
-    c = compare({'Model A': res1,
-                 'Model B': res2})
+    c = compare({"Model A": res1, "Model B": res2})
     c.summary
@@ -316,10 +331,10 @@ def test_compare_single(data):
     c = compare([res1])
     assert len(c.rsquared) == 1
     c.summary
-    c = compare({'Model A': res1})
+    c = compare({"Model A": res1})
     c.summary
     res = OrderedDict()
-    res['Model A'] = res1
+    res["Model A"] = res1
     c = compare(res)
     c.summary
     c.pvalues
@@ -339,18 +354,18 @@ def test_first_stage_summary(data):
 
 def test_gmm_str(data):
     mod = IVGMM(data.dep, data.exog, data.endog, data.instr)
-    str(mod.fit(cov_type='unadjusted'))
-    str(mod.fit(cov_type='robust'))
-    str(mod.fit(cov_type='clustered', clusters=data.clusters))
-    str(mod.fit(cov_type='kernel'))
+    str(mod.fit(cov_type="unadjusted"))
+    str(mod.fit(cov_type="robust"))
+    str(mod.fit(cov_type="clustered", clusters=data.clusters))
+    str(mod.fit(cov_type="kernel"))
 
 
 def test_gmm_cue_optimization_options(data):
     mod = IVGMMCUE(data.dep, data.exog, data.endog, data.instr)
     res_none = mod.fit(display=False)
-    opt_options = dict(method='BFGS', options={'disp': False})
+    opt_options = dict(method="BFGS", options={"disp": False})
     res_bfgs = mod.fit(display=False, opt_options=opt_options)
-    opt_options = dict(method='L-BFGS-B', options={'disp': False})
+    opt_options = dict(method="L-BFGS-B", options={"disp": False})
     res_lbfgsb = mod.fit(display=False, opt_options=opt_options)
     assert res_none.iterations > 2
     assert res_bfgs.iterations > 2
diff --git a/linearmodels/tests/iv/test_postestimation.py b/linearmodels/tests/iv/test_postestimation.py
index 8ad58f4f43..a61e859264 100644
--- a/linearmodels/tests/iv/test_postestimation.py
+++ b/linearmodels/tests/iv/test_postestimation.py
@@ -11,28 +11,32 @@
 
 CWD = os.path.split(os.path.abspath(__file__))[0]
 
-HOUSING_DATA = pd.read_csv(os.path.join(CWD, 'results', 'housing.csv'), index_col=0)
-HOUSING_DATA.region = HOUSING_DATA.region.astype('category')
-HOUSING_DATA.state = HOUSING_DATA.state.astype('category')
-HOUSING_DATA.division = HOUSING_DATA.division.astype('category')
+HOUSING_DATA = pd.read_csv(os.path.join(CWD, "results", "housing.csv"), index_col=0)
+HOUSING_DATA.region = HOUSING_DATA.region.astype("category")
+HOUSING_DATA.state = HOUSING_DATA.state.astype("category")
+HOUSING_DATA.division = HOUSING_DATA.division.astype("category")
 
-SIMULATED_DATA = pd.read_stata(os.path.join(CWD, 'results', 'simulated-data.dta'))
+SIMULATED_DATA = pd.read_stata(os.path.join(CWD, "results", "simulated-data.dta"))
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def data():
-    return AttrDict(dep=SIMULATED_DATA.y_robust,
-                    exog=add_constant(SIMULATED_DATA[['x3', 'x4', 'x5']]),
-                    endog=SIMULATED_DATA[['x1', 'x2']],
-                    instr=SIMULATED_DATA[['z1', 'z2']])
+    return AttrDict(
+        dep=SIMULATED_DATA.y_robust,
+        exog=add_constant(SIMULATED_DATA[["x3", "x4", "x5"]]),
+        endog=SIMULATED_DATA[["x1", "x2"]],
+        instr=SIMULATED_DATA[["z1", "z2"]],
+    )
 
 
 def test_sargan(data):
     # Stata code:
     # ivregress 2sls y_robust x3 x4 x5 (x1=z1 z2)
     # estat overid
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1']], data.instr).fit(cov_type='unadjusted')
-    assert_allclose(res.sargan.stat, .176535, rtol=1e-4)
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1"]], data.instr).fit(
+        cov_type="unadjusted"
+    )
+    assert_allclose(res.sargan.stat, 0.176535, rtol=1e-4)
     assert_allclose(res.sargan.pval, 0.6744, rtol=1e-4)
@@ -40,75 +44,85 @@ def test_basmann(data):
     # Stata code:
     # ivregress 2sls y_robust x3 x4 x5 (x1=z1 z2)
     # estat overid
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1']], data.instr).fit(cov_type='unadjusted')
-    assert_allclose(res.basmann.stat, .174822, rtol=1e-4)
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1"]], data.instr).fit(
+        cov_type="unadjusted"
+    )
+    assert_allclose(res.basmann.stat, 0.174822, rtol=1e-4)
     assert_allclose(res.basmann.pval, 0.6759, rtol=1e-3)
 
 
 def test_durbin(data):
-    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type='unadjusted')
+    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type="unadjusted")
     assert_allclose(res.durbin().stat, 35.1258, rtol=1e-4)
     assert_allclose(res.durbin().pval, 0.0000, atol=1e-6)
-    assert_allclose(res.durbin('x1').stat, .156341, rtol=1e-4)
-    assert_allclose(res.durbin('x1').pval, 0.6925, rtol=1e-3)
+    assert_allclose(res.durbin("x1").stat, 0.156341, rtol=1e-4)
+    assert_allclose(res.durbin("x1").pval, 0.6925, rtol=1e-3)
 
 
 def test_wu_hausman(data):
-    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type='unadjusted')
+    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type="unadjusted")
     assert_allclose(res.wu_hausman().stat, 18.4063, rtol=1e-4)
     assert_allclose(res.wu_hausman().pval, 0.0000, atol=1e-6)
-    assert_allclose(res.wu_hausman('x1').stat, .154557, rtol=1e-4)
-    assert_allclose(res.wu_hausman('x1').pval, 0.6944, rtol=1e-3)
+    assert_allclose(res.wu_hausman("x1").stat, 0.154557, rtol=1e-4)
+    assert_allclose(res.wu_hausman("x1").pval, 0.6944, rtol=1e-3)
 
 
 def test_wooldridge_score(data):
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1', 'x2']], data.instr).fit(cov_type='robust')
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1", "x2"]], data.instr).fit(
+        cov_type="robust"
+    )
     assert_allclose(res.wooldridge_score.stat, 22.684, rtol=1e-4)
     assert_allclose(res.wooldridge_score.pval, 0.0000, atol=1e-4)
 
 
 def test_wooldridge_regression(data):
-    mod = IV2SLS(data.dep, data.exog, data.endog[['x1', 'x2']], data.instr)
-    res = mod.fit(cov_type='robust', debiased=True)
+    mod = IV2SLS(data.dep, data.exog, data.endog[["x1", "x2"]], data.instr)
+    res = mod.fit(cov_type="robust", debiased=True)
     # Scale to correct for F vs Wald treatment
     assert_allclose(res.wooldridge_regression.stat, 2 * 13.3461, rtol=1e-4)
     assert_allclose(res.wooldridge_regression.pval, 0.0000, atol=1e-4)
 
 
 def test_wooldridge_overid(data):
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1']], data.instr).fit(cov_type='robust')
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1"]], data.instr).fit(
+        cov_type="robust"
+    )
    assert_allclose(res.wooldridge_overid.stat, 0.221648, rtol=1e-4)
     assert_allclose(res.wooldridge_overid.pval, 0.6378, rtol=1e-3)
 
 
 def test_anderson_rubin(data):
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1']], data.instr).fit(cov_type='unadjusted')
-    assert_allclose(res.nobs * (res._liml_kappa - 1), .176587, rtol=1e-4)
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1"]], data.instr).fit(
+        cov_type="unadjusted"
+    )
+    assert_allclose(res.nobs * (res._liml_kappa - 1), 0.176587, rtol=1e-4)
 
 
 def test_basmann_f(data):
-    res = IV2SLS(data.dep, data.exog, data.endog[['x1']], data.instr).fit(cov_type='unadjusted')
-    assert_allclose(res.basmann_f.stat, .174821, rtol=1e-4)
+    res = IV2SLS(data.dep, data.exog, data.endog[["x1"]], data.instr).fit(
+        cov_type="unadjusted"
+    )
+    assert_allclose(res.basmann_f.stat, 0.174821, rtol=1e-4)
     assert_allclose(res.basmann_f.pval, 0.6760, rtol=1e-3)
 
 
 def test_c_stat_smoke(data):
-    res = IVGMM(data.dep, data.exog, data.endog, data.instr).fit(cov_type='robust')
+    res = IVGMM(data.dep, data.exog, data.endog, data.instr).fit(cov_type="robust")
     c_stat = res.c_stat()
     assert_allclose(c_stat.stat, 22.684, rtol=1e-4)
     assert_allclose(c_stat.pval, 0.00, atol=1e-3)
-    c_stat = res.c_stat(['x1'])
-    assert_allclose(c_stat.stat, .158525, rtol=1e-3)
+    c_stat = res.c_stat(["x1"])
+    assert_allclose(c_stat.stat, 0.158525, rtol=1e-3)
     assert_allclose(c_stat.pval, 0.6905, rtol=1e-3)
     # Final test
-    c_stat2 = res.c_stat('x1')
+    c_stat2 = res.c_stat("x1")
     assert_allclose(c_stat.stat, c_stat2.stat)
 
 
 def test_linear_restriction(data):
-    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type='robust')
+    res = IV2SLS(data.dep, data.exog, data.endog, data.instr).fit(cov_type="robust")
     nvar = len(res.params)
     q = np.eye(nvar)
     ts = res.wald_test(q, np.zeros(nvar))
@@ -118,6 +132,6 @@ def test_linear_restriction(data):
     assert_allclose(stat, ts.stat)
     assert ts.df == nvar
 
-    formula = ' = '.join(res.params.index) + ' = 0'
+    formula = " = ".join(res.params.index) + " = 0"
     ts2 = res.wald_test(formula=formula)
     assert_allclose(ts.stat, ts2.stat)
diff --git a/linearmodels/tests/iv/test_results.py b/linearmodels/tests/iv/test_results.py
index 05af3ac34e..750caf9d5c 100644
--- a/linearmodels/tests/iv/test_results.py
+++ b/linearmodels/tests/iv/test_results.py
@@ -10,21 +10,21 @@
 from linearmodels.tests.panel._utility import assert_frame_similar
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def data():
     return generate_data()
 
 
-@pytest.fixture(scope='module', params=[IV2SLS, IVLIML, IVGMM, IVGMMCUE])
+@pytest.fixture(scope="module", params=[IV2SLS, IVLIML, IVGMM, IVGMMCUE])
 def model(request):
     return request.param
 
 
 def result_checker(res):
     for attr in dir(res):
-        if attr.startswith('_') or attr in ('test_linear_constraint', 'wald_test'):
+        if attr.startswith("_") or attr in ("test_linear_constraint", "wald_test"):
             continue
-        if attr == 'first_stage':
+        if attr == "first_stage":
             result_checker(getattr(res, attr))
         attr = getattr(res, attr)
         if callable(attr):
@@ -36,28 +36,28 @@ def result_checker(res):
 
 def test_results(data, model):
     mod = model(data.dep, data.exog, data.endog, data.instr)
-    result_checker(mod.fit(cov_type='unadjusted'))
-    result_checker(mod.fit(cov_type='robust'))
-    result_checker(mod.fit(cov_type='kernel'))
-    result_checker(mod.fit(cov_type='clustered', clusters=data.clusters))
+    result_checker(mod.fit(cov_type="unadjusted"))
+    result_checker(mod.fit(cov_type="robust"))
+    result_checker(mod.fit(cov_type="kernel"))
+    result_checker(mod.fit(cov_type="clustered", clusters=data.clusters))
 
     result_checker(model(data.dep, data.exog, None, None).fit())
 
 
 def test_results_single(data, model):
     mod = model(data.dep, data.exog[:, 0], data.endog[:, 0], data.instr[:, 0])
-    result_checker(mod.fit(cov_type='unadjusted'))
-    result_checker(mod.fit(cov_type='robust'))
-    result_checker(mod.fit(cov_type='kernel'))
-    result_checker(mod.fit(cov_type='clustered', clusters=data.clusters))
+    result_checker(mod.fit(cov_type="unadjusted"))
+    result_checker(mod.fit(cov_type="robust"))
+    result_checker(mod.fit(cov_type="kernel"))
+    result_checker(mod.fit(cov_type="clustered", clusters=data.clusters))
 
 
 def test_results_no_exog(data, model):
     mod = model(data.dep, None, data.endog[:, 0], data.instr[:, 0])
-    result_checker(mod.fit(cov_type='unadjusted'))
-    result_checker(mod.fit(cov_type='robust'))
-    result_checker(mod.fit(cov_type='kernel'))
-    result_checker(mod.fit(cov_type='clustered', clusters=data.clusters))
+    result_checker(mod.fit(cov_type="unadjusted"))
+    result_checker(mod.fit(cov_type="robust"))
+    result_checker(mod.fit(cov_type="kernel"))
+    result_checker(mod.fit(cov_type="clustered", clusters=data.clusters))
 
 
 def test_fitted_predict(data, model):
@@ -66,7 +66,7 @@ def test_fitted_predict(data, model):
     assert_series_equal(res.idiosyncratic, res.resids)
     y = mod.dependent.pandas
     expected = asarray(y) - asarray(res.resids)[:, None]
-    expected = DataFrame(expected, y.index, ['fitted_values'])
["fitted_values"]) assert_frame_similar(expected, res.fitted_values) assert_allclose(expected, res.fitted_values) pred = res.predict() @@ -76,7 +76,7 @@ def test_fitted_predict(data, model): pred = res.predict(idiosyncratic=True, missing=True) nobs = IVData(data.dep).pandas.shape[0] assert pred.shape == (nobs, 2) - assert list(pred.columns) == ['fitted_values', 'residual'] + assert list(pred.columns) == ["fitted_values", "residual"] def test_fitted_predict_exception(data, model): diff --git a/linearmodels/tests/panel/_utility.py b/linearmodels/tests/panel/_utility.py index 979c68c83d..77601d1fdc 100644 --- a/linearmodels/tests/panel/_utility.py +++ b/linearmodels/tests/panel/_utility.py @@ -16,13 +16,14 @@ except ImportError: MISSING_XARRAY = True -datatypes = ['numpy', 'pandas'] +datatypes = ["numpy", "pandas"] if not MISSING_XARRAY: - datatypes += ['xarray'] + datatypes += ["xarray"] -def lsdv(y: DataFrame, x: DataFrame, has_const=False, entity=False, time=False, - general=None): +def lsdv( + y: DataFrame, x: DataFrame, has_const=False, entity=False, time=False, general=None +): nvar = x.shape[1] temp = x.reset_index() cat_index = temp.index @@ -30,23 +31,29 @@ def lsdv(y: DataFrame, x: DataFrame, has_const=False, entity=False, time=False, cat = Categorical(temp.iloc[:, 0]) cat.index = cat_index dummies = get_dummies(cat, drop_first=has_const) - x = DataFrame(np.c_[x.values, dummies.values.astype(np.float64)], - index=x.index, - columns=list(x.columns) + list(dummies.columns)) + x = DataFrame( + np.c_[x.values, dummies.values.astype(np.float64)], + index=x.index, + columns=list(x.columns) + list(dummies.columns), + ) if time: cat = Categorical(temp.iloc[:, 1]) cat.index = cat_index dummies = get_dummies(cat, drop_first=(has_const or entity)) - x = DataFrame(np.c_[x.values, dummies.values.astype(np.float64)], - index=x.index, - columns=list(x.columns) + list(dummies.columns)) + x = DataFrame( + np.c_[x.values, dummies.values.astype(np.float64)], + index=x.index, + columns=list(x.columns) + list(dummies.columns), + ) if general is not None: cat = Categorical(general) cat.index = cat_index dummies = get_dummies(cat, drop_first=(has_const or entity or time)) - x = DataFrame(np.c_[x.values, dummies.values.astype(np.float64)], - index=x.index, - columns=list(x.columns) + list(dummies.columns)) + x = DataFrame( + np.c_[x.values, dummies.values.astype(np.float64)], + index=x.index, + columns=list(x.columns) + list(dummies.columns), + ) w = np.ones_like(y) wy = w * y.values @@ -57,8 +64,15 @@ def lsdv(y: DataFrame, x: DataFrame, has_const=False, entity=False, time=False, return params[:nvar] -def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects=0, rng=None, - num_cats=4): +def generate_data( + missing, + datatype, + const=False, + ntk=(971, 7, 5), + other_effects=0, + rng=None, + num_cats=4, +): if rng is None: np.random.seed(12345) else: @@ -72,9 +86,9 @@ def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects w = np.random.chisquare(5, (t, n)) / 5 c = None if other_effects == 1: - cats = ['Industries'] + cats = ["Industries"] else: - cats = ['cat.' + str(i) for i in range(other_effects)] + cats = ["cat." + str(i) for i in range(other_effects)] if other_effects: if not isinstance(num_cats, list): num_cats = [num_cats] * other_effects @@ -84,7 +98,7 @@ def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects c.append(np.random.randint(0, nc, (1, t, n))) c = np.concatenate(c, 0) - vcats = ['varcat.' 
-    vcats = ['varcat.' + str(i) for i in range(2)]
+    vcats = ["varcat." + str(i) for i in range(2)]
     vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
     vc1 = vc2[[0]]
@@ -97,51 +111,70 @@ def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects
         locs = np.random.choice(n * t * k, int(n * t * k * missing))
         x.flat[locs] = np.nan
 
-    if datatype in ('pandas', 'xarray'):
-        entities = ['firm' + str(i) for i in range(n)]
-        time = date_range('1-1-1900', periods=t, freq='A-DEC')
-        var_names = ['x' + str(i) for i in range(k)]
+    if datatype in ("pandas", "xarray"):
+        entities = ["firm" + str(i) for i in range(n)]
+        time = date_range("1-1-1900", periods=t, freq="A-DEC")
+        var_names = ["x" + str(i) for i in range(k)]
         # y = DataFrame(y, index=time, columns=entities)
-        y = panel_to_frame(y[None], items=['y'], major_axis=time, minor_axis=entities, swap=True)
-        w = panel_to_frame(w[None], items=['w'], major_axis=time, minor_axis=entities, swap=True)
+        y = panel_to_frame(
+            y[None], items=["y"], major_axis=time, minor_axis=entities, swap=True
+        )
+        w = panel_to_frame(
+            w[None], items=["w"], major_axis=time, minor_axis=entities, swap=True
+        )
         w = w.reindex(y.index)
-        x = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities, swap=True)
+        x = panel_to_frame(
+            x, items=var_names, major_axis=time, minor_axis=entities, swap=True
+        )
         x = x.reindex(y.index)
-        c = panel_to_frame(c, items=cats, major_axis=time, minor_axis=entities, swap=True)
+        c = panel_to_frame(
+            c, items=cats, major_axis=time, minor_axis=entities, swap=True
+        )
         c = c.reindex(y.index)
-        vc1 = panel_to_frame(vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True)
+        vc1 = panel_to_frame(
+            vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True
+        )
         vc1 = vc1.reindex(y.index)
-        vc2 = panel_to_frame(vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True)
+        vc2 = panel_to_frame(
+            vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True
+        )
         vc2 = vc2.reindex(y.index)
 
-    if datatype == 'xarray':
+    if datatype == "xarray":
        # TODO: This is broken now, need to transform multiindex to xarray 3d
         import xarray as xr
-        x = xr.DataArray(PanelData(x).values3d,
-                         coords={'entities': entities, 'time': time,
-                                 'vars': var_names},
-                         dims=['vars', 'time', 'entities'])
-        y = xr.DataArray(PanelData(y).values3d,
-                         coords={'entities': entities, 'time': time,
-                                 'vars': ['y']},
-                         dims=['vars', 'time', 'entities'])
-        w = xr.DataArray(PanelData(w).values3d,
-                         coords={'entities': entities, 'time': time,
-                                 'vars': ['w']},
-                         dims=['vars', 'time', 'entities'])
+
+        x = xr.DataArray(
+            PanelData(x).values3d,
+            coords={"entities": entities, "time": time, "vars": var_names},
+            dims=["vars", "time", "entities"],
+        )
+        y = xr.DataArray(
+            PanelData(y).values3d,
+            coords={"entities": entities, "time": time, "vars": ["y"]},
+            dims=["vars", "time", "entities"],
+        )
+        w = xr.DataArray(
+            PanelData(w).values3d,
+            coords={"entities": entities, "time": time, "vars": ["w"]},
+            dims=["vars", "time", "entities"],
+        )
         if c.shape[1] > 0:
-            c = xr.DataArray(PanelData(c).values3d,
-                             coords={'entities': entities, 'time': time,
-                                     'vars': c.columns},
-                             dims=['vars', 'time', 'entities'])
-            vc1 = xr.DataArray(PanelData(vc1).values3d,
-                               coords={'entities': entities, 'time': time,
-                                       'vars': vc1.columns},
-                               dims=['vars', 'time', 'entities'])
-            vc2 = xr.DataArray(PanelData(vc2).values3d,
-                               coords={'entities': entities, 'time': time,
-                                       'vars': vc2.columns},
-                               dims=['vars', 'time', 'entities'])
+            c = xr.DataArray(
+                PanelData(c).values3d,
+                coords={"entities": entities, "time": time, "vars": c.columns},
+                dims=["vars", "time", "entities"],
+            )
+            vc1 = xr.DataArray(
+                PanelData(vc1).values3d,
+                coords={"entities": entities, "time": time, "vars": vc1.columns},
+                dims=["vars", "time", "entities"],
+            )
+            vc2 = xr.DataArray(
+                PanelData(vc2).values3d,
+                coords={"entities": entities, "time": time, "vars": vc2.columns},
+                dims=["vars", "time", "entities"],
+            )
 
     if rng is not None:
         rng.set_state(np.random.get_state())
@@ -152,27 +185,34 @@ def generate_data(missing, datatype, const=False, ntk=(971, 7, 5), other_effects
 
 def assert_results_equal(res1, res2, test_fit=True, test_df=True, strict=True):
     n = min(res1.params.shape[0], res2.params.shape[0])
-    assert_series_equal(res1.params.iloc[:n], res2.params.iloc[:n],
-                        check_less_precise=not strict)
-    assert_series_equal(res1.pvalues.iloc[:n], res2.pvalues.iloc[:n],
-                        check_less_precise=not strict)
-    assert_series_equal(res1.tstats.iloc[:n], res2.tstats.iloc[:n],
-                        check_less_precise=not strict)
-    assert_frame_equal(res1.cov.iloc[:n, :n], res2.cov.iloc[:n, :n],
-                       check_less_precise=not strict)
-    assert_frame_equal(res1.conf_int().iloc[:n], res2.conf_int().iloc[:n],
-                       check_less_precise=not strict)
+    assert_series_equal(
+        res1.params.iloc[:n], res2.params.iloc[:n], check_less_precise=not strict
+    )
+    assert_series_equal(
+        res1.pvalues.iloc[:n], res2.pvalues.iloc[:n], check_less_precise=not strict
+    )
+    assert_series_equal(
+        res1.tstats.iloc[:n], res2.tstats.iloc[:n], check_less_precise=not strict
+    )
+    assert_frame_equal(
+        res1.cov.iloc[:n, :n], res2.cov.iloc[:n, :n], check_less_precise=not strict
+    )
+    assert_frame_equal(
+        res1.conf_int().iloc[:n],
+        res2.conf_int().iloc[:n],
+        check_less_precise=not strict,
+    )
     assert_allclose(res1.s2, res2.s2)
     rtol = 1e-7 if strict else 1e-4
     delta = 1 + (res1.resids.values - res2.resids.values) / max(
-        res1.resids.std(),
-        res2.resids.std())
+        res1.resids.std(), res2.resids.std()
+    )
     assert_allclose(delta, np.ones_like(delta), rtol=rtol)
     delta = 1 + (res1.wresids.values - res2.wresids.values) / max(
-        res1.wresids.std(),
-        res2.wresids.std())
+        res1.wresids.std(), res2.wresids.std()
+    )
     assert_allclose(delta, np.ones_like(delta), rtol=rtol)
 
     if test_df:
@@ -197,7 +237,7 @@ def assert_frame_similar(result, expected):
 
 def access_attributes(result):
     d = dir(result)
     for key in d:
-        if not key.startswith('_') and key not in ('wald_test',):
+        if not key.startswith("_") and key not in ("wald_test",):
             val = getattr(result, key)
             if callable(val):
                 val()
diff --git a/linearmodels/tests/panel/results/execute-stata-simulated-data.py b/linearmodels/tests/panel/results/execute-stata-simulated-data.py
index c9074559d5..ab7b140700 100644
--- a/linearmodels/tests/panel/results/execute-stata-simulated-data.py
+++ b/linearmodels/tests/panel/results/execute-stata-simulated-data.py
@@ -3,32 +3,34 @@
 from os.path import join
 import subprocess
 
-STATA_PATH = join('C:\\', 'Program Files (x86)', 'Stata13', 'StataMP-64.exe')
+STATA_PATH = join("C:\\", "Program Files (x86)", "Stata13", "StataMP-64.exe")
 
-dtafile = join(os.getcwd(), 'simulated-panel.dta')
+dtafile = join(os.getcwd(), "simulated-panel.dta")
 
 # Permutations
 # estimator -> be, fe, or regress to match pooled
 # datasets, (nothing), _light, _heavy
 # vce options -> conventional (be, fe, re), robust(re, fe, *regress*), ols(*regress*)
 
-configs = {'xtreg {vars}, be vce(conventional)': 'between-conventional-',
-           'xtreg {vars}, be wls vce(conventional)': 'between-conventional-wls',
-           'xtreg {vars}, fe vce(conventional)': 'fixed_effect-conventional-',
-           'xtreg {vars}, fe vce(robust)': 'fixed_effect-robust-',
-           'xtreg {vars}, fe vce(cluster firm_id)': 'fixed_effect-cluster-',
-           'xtreg {vars}, re vce(conventional)': 'random_effect-conventional-',
-           'xtreg {vars}, re vce(robust)': 'random_effect-robust-',
-           'xtreg {vars}, re vce(cluster firm_id)': 'random_effect-cluster-',
-           'xtreg {vars} [aweight=w], fe vce(conventional)': 'fixed_effect-conventional-weighted',
-           'xtreg {vars} [aweight=w], fe vce(robust)': 'fixed_effect-robust-weighted',
-           'xtreg {vars} [aweight=w], fe vce(cluster firm_id)': 'fixed_effect-cluster-weighted',
-           'regress {vars}, vce(ols)': 'pooled-conventional-',
-           'regress {vars}, vce(robust)': 'pooled-robust-',
-           'regress {vars}, vce(cluster firm_id)': 'pooled-cluster-',
-           'regress {vars} [aweight=w], vce(ols)': 'pooled-conventional-weighted',
-           'regress {vars} [aweight=w], vce(robust)': 'pooled-robust-weighted',
-           'regress {vars} [aweight=w], vce(cluster firm_id)': 'pooled-cluster-weighted'}
+configs = {
+    "xtreg {vars}, be vce(conventional)": "between-conventional-",
+    "xtreg {vars}, be wls vce(conventional)": "between-conventional-wls",
+    "xtreg {vars}, fe vce(conventional)": "fixed_effect-conventional-",
+    "xtreg {vars}, fe vce(robust)": "fixed_effect-robust-",
+    "xtreg {vars}, fe vce(cluster firm_id)": "fixed_effect-cluster-",
+    "xtreg {vars}, re vce(conventional)": "random_effect-conventional-",
+    "xtreg {vars}, re vce(robust)": "random_effect-robust-",
+    "xtreg {vars}, re vce(cluster firm_id)": "random_effect-cluster-",
+    "xtreg {vars} [aweight=w], fe vce(conventional)": "fixed_effect-conventional-weighted",
+    "xtreg {vars} [aweight=w], fe vce(robust)": "fixed_effect-robust-weighted",
+    "xtreg {vars} [aweight=w], fe vce(cluster firm_id)": "fixed_effect-cluster-weighted",
+    "regress {vars}, vce(ols)": "pooled-conventional-",
+    "regress {vars}, vce(robust)": "pooled-robust-",
+    "regress {vars}, vce(cluster firm_id)": "pooled-cluster-",
+    "regress {vars} [aweight=w], vce(ols)": "pooled-conventional-weighted",
+    "regress {vars} [aweight=w], vce(robust)": "pooled-robust-weighted",
+    "regress {vars} [aweight=w], vce(cluster firm_id)": "pooled-cluster-weighted",
+}
 
 od = OrderedDict()  # type: OrderedDict
 for key in sorted(configs.keys()):
@@ -39,11 +41,13 @@
 start = """
 use {dtafile}, clear \n
 xtset firm_id time \n
-""".format(dtafile=dtafile)
+""".format(
+    dtafile=dtafile
+)
 
-_sep = '#################!{config}-{ending}!####################'
-endings = ['', '_light', '_heavy']
-variables = ['y', 'x1', 'x2', 'x3', 'x4', 'x5']
+_sep = "#################!{config}-{ending}!####################"
+endings = ["", "_light", "_heavy"]
+variables = ["y", "x1", "x2", "x3", "x4", "x5"]
 
 results = """
 estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g)) p(fmt(%13.12g)))
 """
@@ -64,25 +68,25 @@
 file close myfile
 """
 
-outfile = os.path.join(os.getcwd(), 'stata-panel-simulated-results.txt')
+outfile = os.path.join(os.getcwd(), "stata-panel-simulated-results.txt")
 if os.path.exists(outfile):
     os.unlink(outfile)
 
-with open('simulated-results.do', 'w') as stata:
+with open("simulated-results.do", "w") as stata:
     stata.write(start)
     for config in configs:
         descr = configs[config]
         for ending in endings:
-            _vars = ' '.join([v + ending for v in variables])
+            _vars = " ".join([v + ending for v in variables])
             command = config.format(vars=_vars)
             sep = _sep.format(config=descr, ending=ending)
             stata.write(section_header.format(outfile=outfile, separator=sep))
-            stata.write(command + '\n')
+            stata.write(command + "\n")
             stata.write(results.format(outfile=outfile))
-    stata.write('\n' * 4)
+    stata.write("\n" * 4)
 
-do_file = join(os.getcwd(), 'simulated-results.do')
-cmd = [STATA_PATH, '/e', 'do', do_file]
-print(' '.join(cmd))
+do_file = join(os.getcwd(), "simulated-results.do")
+cmd = [STATA_PATH, "/e", "do", do_file]
+print(" ".join(cmd))
 subprocess.call(cmd)
diff --git a/linearmodels/tests/panel/results/generate-panel-data.py b/linearmodels/tests/panel/results/generate-panel-data.py
index 25d09ad932..9cd7956d3a 100644
--- a/linearmodels/tests/panel/results/generate-panel-data.py
+++ b/linearmodels/tests/panel/results/generate-panel-data.py
@@ -19,14 +19,14 @@
 w = np.ones((t, 1)) @ w
 w = w / w.mean()
 
-items = ['x' + str(i) for i in range(1, k + 1)]
-items = ['intercept'] + items
-major = pd.date_range('12-31-1999', periods=t, freq='A-DEC')
-minor = ['firm.' + str(i) for i in range(1, n + 1)]
+items = ["x" + str(i) for i in range(1, k + 1)]
+items = ["intercept"] + items
+major = pd.date_range("12-31-1999", periods=t, freq="A-DEC")
+minor = ["firm." + str(i) for i in range(1, n + 1)]
 
 x = panel_to_frame(x, items, major, minor, swap=True)
-y = panel_to_frame(y[None, :], ['y'], major, minor, swap=True)
-w = panel_to_frame(w[None, :], ['w'], major, minor, swap=True)
+y = panel_to_frame(y[None, :], ["y"], major, minor, swap=True)
+w = panel_to_frame(w[None, :], ["w"], major, minor, swap=True)
 
 x = PanelData(x)
 y = PanelData(y)
@@ -34,29 +34,29 @@
 
 z = concat([x.dataframe, y.dataframe, w.dataframe], 1)
 final_index = pd.MultiIndex.from_product([minor, major])
-final_index.levels[0].name = 'firm'
+final_index.levels[0].name = "firm"
 z = z.reindex(final_index)
-z.index.levels[0].name = 'firm'
-z.index.levels[1].name = 'time'
+z.index.levels[0].name = "firm"
+z.index.levels[1].name = "time"
 z = z.reset_index()
-z['firm_id'] = z.firm.astype('category')
-z['firm_id'] = z.firm_id.cat.codes
+z["firm_id"] = z.firm.astype("category")
+z["firm_id"] = z.firm_id.cat.codes
 
-vars = ['y', 'x1', 'x2', 'x3', 'x4', 'x5']
+vars = ["y", "x1", "x2", "x3", "x4", "x5"]
 missing = 0.05
 for v in vars:
     locs = np.random.choice(n * t, int(n * t * missing))
     temp = z[v].copy()
     temp.loc[locs] = np.nan
-    z[v + '_light'] = temp
+    z[v + "_light"] = temp
 
-vars = ['y', 'x1', 'x2', 'x3', 'x4', 'x5']
+vars = ["y", "x1", "x2", "x3", "x4", "x5"]
 missing = 0.20
 for v in vars:
     locs = np.random.choice(n * t, int(n * t * missing))
     temp = z[v].copy()
     temp.loc[locs] = np.nan
-    z[v + '_heavy'] = temp
+    z[v + "_heavy"] = temp
 
-z.to_stata('simulated-panel.dta')
+z.to_stata("simulated-panel.dta")
diff --git a/linearmodels/tests/panel/results/parse_stata_results.py b/linearmodels/tests/panel/results/parse_stata_results.py
index ef13977aee..41d65ff168 100644
--- a/linearmodels/tests/panel/results/parse_stata_results.py
+++ b/linearmodels/tests/panel/results/parse_stata_results.py
@@ -5,22 +5,22 @@
 
 from linearmodels.utility import AttrDict
 
-filename = 'stata-panel-simulated-results.txt'
+filename = "stata-panel-simulated-results.txt"
 cwd = os.path.split(os.path.abspath(__file__))[0]
 
 blocks = {}
 block = []
-key = ''
+key = ""
 with open(os.path.join(cwd, filename)) as results:
     for line in results.readlines():
         line = line.strip()
         if not line:
             continue
-        if '###!' in line:
+        if "###!" in line:
             if key:
                 blocks[key] = block
                 block = []
-            key = line.split('!')[1]
+            key = line.split("!")[1]
         block.append(line)
     if block:
         blocks[key] = block
@@ -30,30 +30,29 @@ def parse_block(block):
     params = {}
     stats = {}
     for i, line in enumerate(block):
-        if 'b/t' in line:
+        if "b/t" in line:
             params_start = i + 1
-        if 'rss' in line:
+        if "rss" in line:
             stats_start = i
-        if '** Variance **' in line:
+        if "** Variance **" in line:
             variance_start = i + 1
     for i in range(params_start, stats_start, 3):
-        name, value = block[i].split('\t')
+        name, value = block[i].split("\t")
         value = float(value)
         tstat = float(block[i + 1])
         pvalue = float(block[i + 1])
-        params[name] = pd.Series(
-            {'param': value, 'tstat': tstat, 'pvalue': pvalue})
+        params[name] = pd.Series({"param": value, "tstat": tstat, "pvalue": pvalue})
     params = pd.DataFrame(params).sort_index()
     for i in range(stats_start, variance_start - 1):
-        if '\t' in block[i]:
-            name, value = block[i].split('\t')
+        if "\t" in block[i]:
+            name, value = block[i].split("\t")
             stats[name] = float(value)
         else:
             stats[block[i]] = None
     stats = pd.Series(stats)
-    var = '\n'.join(block[variance_start + 1:])
-    variance = pd.read_csv(StringIO(',' + var.replace('\t', ',')))
+    var = "\n".join(block[variance_start + 1 :])
+    variance = pd.read_csv(StringIO("," + var.replace("\t", ",")))
     index = variance.pop(variance.columns[0])
     index.name = None
     variance.index = index
@@ -70,5 +69,5 @@ def data():
     return blocks
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     print(data())
diff --git a/linearmodels/tests/panel/test_between_ols.py b/linearmodels/tests/panel/test_between_ols.py
index 0487da1f98..4a08b2e672 100644
--- a/linearmodels/tests/panel/test_between_ols.py
+++ b/linearmodels/tests/panel/test_between_ols.py
@@ -15,7 +15,9 @@
                                                 assert_results_equal, datatypes,
                                                 generate_data)
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 
 def data_gen(missing, datatype):
@@ -58,17 +60,16 @@ def test_single_entity(data):
     dep = mod.dependent.dataframe
     exog = mod.exog.dataframe
     ols = IV2SLS(dep, exog, None, None)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
-    res = mod.fit(cov_type='robust', debiased=False)
-    ols_res = ols.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", debiased=False)
+    ols_res = ols.fit(cov_type="robust")
     assert_results_equal(res, ols_res)
 
-    clusters = pd.DataFrame(np.random.randint(0, 9, dep.shape),
-                            index=dep.index)
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
-    ols_res = ols.fit(cov_type='clustered', clusters=clusters)
+    clusters = pd.DataFrame(np.random.randint(0, 9, dep.shape), index=dep.index)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
+    ols_res = ols.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, ols_res)
@@ -97,17 +98,16 @@ def test_single_entity_weights(data):
     dep = mod.dependent.dataframe
     exog = mod.exog.dataframe
     ols = IV2SLS(dep, exog, None, None, weights=mod.weights.values2d)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
-    res = mod.fit(cov_type='robust', debiased=False)
-    ols_res = ols.fit(cov_type='robust', debiased=False)
+    res = mod.fit(cov_type="robust", debiased=False)
+    ols_res = ols.fit(cov_type="robust", debiased=False)
     assert_results_equal(res, ols_res)
 
-    clusters = pd.DataFrame(np.random.randint(0, 9, dep.shape),
-                            index=dep.index)
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
-    ols_res = ols.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    clusters = pd.DataFrame(np.random.randint(0, 9, dep.shape), index=dep.index)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
+    ols_res = ols.fit(cov_type="clustered", clusters=clusters, debiased=False)
     assert_results_equal(res, ols_res)
@@ -116,14 +116,13 @@ def test_multiple_obs_per_entity(data):
     res = mod.fit(reweight=True, debiased=False)
 
     dep = mod.dependent.values3d.mean(1).T
-    exog = pd.DataFrame(mod.exog.values3d.mean(1).T,
-                        columns=mod.exog.vars)
+    exog = pd.DataFrame(mod.exog.values3d.mean(1).T, columns=mod.exog.vars)
     ols = IV2SLS(dep, exog, None, None)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
-    res = mod.fit(cov_type='robust', debiased=False)
-    ols_res = ols.fit(cov_type='robust', debiased=False)
+    res = mod.fit(cov_type="robust", debiased=False)
+    ols_res = ols.fit(cov_type="robust", debiased=False)
     assert_results_equal(res, ols_res)
 
     clusters = mod.dependent.dataframe.copy()
@@ -133,8 +132,8 @@ def test_multiple_obs_per_entity(data):
         clusters.loc[entity] = np.random.randint(9)
     ols_clusters = PanelData(clusters).values3d.mean(1).T.astype(np.int32)
 
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
-    ols_res = ols.fit(cov_type='clustered', clusters=ols_clusters)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
+    ols_res = ols.fit(cov_type="clustered", clusters=ols_clusters)
     assert_results_equal(res, ols_res)
@@ -152,11 +151,11 @@ def test_multiple_obs_per_entity_weighted(data):
     exog = pd.DataFrame(wexog, columns=mod.exog.vars)
 
     ols = IV2SLS(dep, exog, None, None, weights=weights)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
-    res = mod.fit(cov_type='robust', debiased=False)
-    ols_res = ols.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", debiased=False)
+    ols_res = ols.fit(cov_type="robust")
     assert_results_equal(res, ols_res)
 
     clusters = mod.dependent.dataframe.copy()
@@ -166,8 +165,8 @@ def test_multiple_obs_per_entity_weighted(data):
         clusters.loc[entity] = np.random.randint(9)
     ols_clusters = PanelData(clusters).values3d.mean(1).T.astype(np.int32)
 
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
-    ols_res = ols.fit(cov_type='clustered', clusters=ols_clusters)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
+    ols_res = ols.fit(cov_type="clustered", clusters=ols_clusters)
     assert_results_equal(res, ols_res)
@@ -184,20 +183,21 @@ def test_missing(missing_data):
     weights = weights.reindex(mod.dependent.entities)
 
     ols = IV2SLS(dep, exog, None, None, weights=weights)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
-    res = mod.fit(reweight=True, cov_type='robust', debiased=False)
-    ols_res = ols.fit(cov_type='robust')
+    res = mod.fit(reweight=True, cov_type="robust", debiased=False)
+    ols_res = ols.fit(cov_type="robust")
     assert_results_equal(res, ols_res)
 
     vc1 = PanelData(missing_data.vc1)
     ols_clusters = vc1.dataframe.groupby(level=0).mean().astype(np.int32)
     ols_clusters = ols_clusters.reindex(mod.dependent.entities)
 
-    res = mod.fit(reweight=True, cov_type='clustered',
-                  clusters=missing_data.vc1, debiased=False)
-    ols_res = ols.fit(cov_type='clustered', clusters=ols_clusters)
+    res = mod.fit(
+        reweight=True, cov_type="clustered", clusters=missing_data.vc1, debiased=False
+    )
+    ols_res = ols.fit(cov_type="clustered", clusters=ols_clusters)
     assert_results_equal(res, ols_res)
@@ -219,14 +219,14 @@ def test_missing_weighted(missing_data):
         exog = (1.0 / weights.values) * exog
 
     ols = IV2SLS(dep, exog, None, None, weights=weights)
-    ols_res = ols.fit(cov_type='unadjusted')
+    ols_res = ols.fit(cov_type="unadjusted")
     assert_results_equal(res, ols_res)
 
 
 def test_unknown_covariance(data):
     mod = BetweenOLS(data.y, data.x)
     with pytest.raises(KeyError):
-        mod.fit(cov_type='unknown')
+        mod.fit(cov_type="unknown")
 
 
 def test_results_access(data):
@@ -266,14 +266,14 @@ def test_2way_cluster(data):
     exog = mod.exog.dataframe.groupby(level=0).mean()
 
     clusters = mod.dependent.dataframe.copy()
-    clusters.columns = ['cluster.0']
-    clusters['cluster.1'] = mod.dependent.dataframe.copy()
+    clusters.columns = ["cluster.0"]
+    clusters["cluster.1"] = mod.dependent.dataframe.copy()
     clusters.loc[:, :] = 0
     clusters = clusters.astype(np.int32)
     for entity in mod.dependent.entities:
         clusters.loc[entity, :] = np.random.randint(33, size=(1, 2))
 
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
 
     dep = dep.reindex(list(res.resids.index))
     exog = exog.reindex(list(res.resids.index))
@@ -282,7 +282,7 @@ def test_2way_cluster(data):
 
     ols_clusters = clusters.groupby(level=0).max()
     ols_clusters = ols_clusters.reindex(list(res.resids.index))
-    ols_res = ols.fit(cov_type='clustered', clusters=ols_clusters)
+    ols_res = ols.fit(cov_type="clustered", clusters=ols_clusters)
     assert_results_equal(res, ols_res)
@@ -296,7 +296,7 @@ def test_cluster_error(data):
     clusters.iloc[::7, :] = 0
 
     with pytest.raises(ValueError):
-        mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+        mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
 
 
 def test_default_clusters(data):
@@ -314,20 +314,23 @@ def test_default_clusters(data):
     x = x[:, [0]]
     y = y[:, [0]]
     mod = BetweenOLS(y, x)
-    res = mod.fit(reweight=True, cov_type='clustered', debiased=False)
+    res = mod.fit(reweight=True, cov_type="clustered", debiased=False)
 
     dep = mod.dependent.dataframe
     exog = mod.exog.dataframe
     ols = IV2SLS(dep, exog, None, None)
-    ols_res = ols.fit(cov_type='clustered')
+    ols_res = ols.fit(cov_type="clustered")
     assert_results_equal(res, ols_res)
 
 
 def test_fitted_effects_residuals(both_data_types):
     mod = BetweenOLS(both_data_types.y, both_data_types.x)
     res = mod.fit(reweight=True, debiased=False)
 
-    expected = pd.DataFrame(mod.exog.values2d @ res.params.values, mod.dependent.index,
-                            columns=['fitted_values'])
+    expected = pd.DataFrame(
+        mod.exog.values2d @ res.params.values,
+        mod.dependent.index,
+        columns=["fitted_values"],
+    )
     assert_allclose(expected, res.fitted_values)
     assert_frame_similar(res.fitted_values, expected)
@@ -337,12 +340,12 @@ def test_fitted_effects_residuals(both_data_types):
     resids = resids.reindex(reindex)
     resids.index = index
     expected = pd.DataFrame(resids)
-    expected.columns = ['estimated_effects']
+    expected.columns = ["estimated_effects"]
     assert_allclose(expected, res.estimated_effects)
     assert_frame_similar(res.estimated_effects, expected)
 
     fitted_effects = res.fitted_values.values + res.estimated_effects.values
     expected.iloc[:, 0] = mod.dependent.values2d - fitted_effects
-    expected.columns = ['idiosyncratic']
+    expected.columns = ["idiosyncratic"]
     assert_allclose(expected, res.idiosyncratic, atol=1e-8)
     assert_frame_similar(res.idiosyncratic, expected)
diff --git a/linearmodels/tests/panel/test_cluster_input_formats.py b/linearmodels/tests/panel/test_cluster_input_formats.py
index 1d6d5fccb5..169b28d850 100644
--- a/linearmodels/tests/panel/test_cluster_input_formats.py
+++ b/linearmodels/tests/panel/test_cluster_input_formats.py
@@ -9,11 +9,13 @@
 from linearmodels.panel.model import PanelOLS
 from linearmodels.tests.panel._utility import datatypes, generate_data
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 missing = [0.0, 0.20]
 perms = list(product(missing, datatypes))
-ids = list(map(lambda s: '-'.join(map(str, s)), perms))
+ids = list(map(lambda s: "-".join(map(str, s)), perms))
 
 
 @pytest.fixture(params=perms, ids=ids)
@@ -28,7 +30,7 @@ def test_categorical_input(data):
     effects = np.random.randint(0, 5, size=(nt, 2))
     temp = {}
     for i, e in enumerate(effects.T):
-        name = 'effect.' + str(i)
+        name = "effect." + str(i)
         temp[name] = pd.Categorical(pd.Series(e, index=y.index, name=name))
     effects = pd.DataFrame(temp, index=y.index)
     mod = PanelOLS(data.y, data.x, other_effects=effects)
@@ -37,33 +39,39 @@ def test_categorical_input(data):
     clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
     temp = {}
     for i, c in enumerate(clusters.T):
-        name = 'effect.' + str(i)
+        name = "effect." + str(i)
         temp[name] = pd.Categorical(pd.Series(c, index=y.index, name=name))
     clusters = pd.DataFrame(temp, index=y.index)
 
-    mod.fit(cov_type='clustered', clusters=clusters)
+    mod.fit(cov_type="clustered", clusters=clusters)
 
 
 def test_string_input(data):
     y = PanelData(data.y)
     nt = y.values2d.shape[0]
     temp = {}
-    prim = ['a', 'b', 'c', 'd', 'e']
+    prim = ["a", "b", "c", "d", "e"]
     for i in range(2):
-        name = 'effect.' + str(i)
-        temp[name] = pd.Series(np.random.choice(prim, size=nt), index=y.index, name=name)
+        name = "effect." + str(i)
+        temp[name] = pd.Series(
+            np.random.choice(prim, size=nt), index=y.index, name=name
+        )
     effects = pd.DataFrame(temp, index=y.index)
     mod = PanelOLS(data.y, data.x, other_effects=effects)
     mod.fit()
 
     clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
     temp = {}
-    prim = list(map(lambda s: ''.join(s), list(product(ascii_lowercase, ascii_lowercase))))
+    prim = list(
+        map(lambda s: "".join(s), list(product(ascii_lowercase, ascii_lowercase)))
+    )
     for i in range(clusters.shape[1]):
-        name = 'effect.' + str(i)
-        temp[name] = pd.Series(np.random.choice(prim, size=nt), index=y.index, name=name)
+        name = "effect." + str(i)
+        temp[name] = pd.Series(
+            np.random.choice(prim, size=nt), index=y.index, name=name
+        )
     clusters = pd.DataFrame(temp, index=y.index)
 
-    mod.fit(cov_type='clustered', clusters=clusters)
+    mod.fit(cov_type="clustered", clusters=clusters)
 
 
 def test_integer_input(data):
@@ -72,7 +80,7 @@ def test_integer_input(data):
     effects = np.random.randint(0, 5, size=(nt, 2))
     temp = {}
     for i, e in enumerate(effects.T):
-        name = 'effect.' + str(i)
+        name = "effect." + str(i)
         temp[name] = pd.Series(e, index=y.index, name=name)
     effects = pd.DataFrame(temp, index=y.index)
     mod = PanelOLS(data.y, data.x, other_effects=effects)
@@ -81,30 +89,34 @@ def test_integer_input(data):
     clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
     temp = {}
     for i, c in enumerate(clusters.T):
-        name = 'effect.' + str(i)
+        name = "effect." + str(i)
         temp[name] = pd.Series(c, index=y.index, name=name)
     clusters = pd.DataFrame(temp, index=y.index)
 
-    mod.fit(cov_type='clustered', clusters=clusters)
+    mod.fit(cov_type="clustered", clusters=clusters)
 
 
 def test_mixed_input(data):
     y = PanelData(data.y)
     nt = y.values2d.shape[0]
     effects = np.random.randint(0, 5, size=nt)
-    prim = ['a', 'b', 'c', 'd', 'e']
-    temp = {'effect.0': pd.Categorical(pd.Series(effects, index=y.index)),
-            'effect.1': pd.Series(np.random.choice(prim, size=nt), index=y.index)}
+    prim = ["a", "b", "c", "d", "e"]
+    temp = {
+        "effect.0": pd.Categorical(pd.Series(effects, index=y.index)),
+        "effect.1": pd.Series(np.random.choice(prim, size=nt), index=y.index),
+    }
     effects = pd.DataFrame(temp, index=y.index)
     mod = PanelOLS(data.y, data.x, other_effects=effects)
     mod.fit()
 
     clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
     temp = {}
-    prim = list(map(lambda s: ''.join(s), list(product(ascii_lowercase, ascii_lowercase))))
-    temp['var.cluster.0'] = pd.Series(np.random.choice(prim, size=nt), index=y.index)
-    temp['var.cluster.1'] = pd.Series(clusters[:, 1], index=y.index)
+    prim = list(
+        map(lambda s: "".join(s), list(product(ascii_lowercase, ascii_lowercase)))
+    )
+    temp["var.cluster.0"] = pd.Series(np.random.choice(prim, size=nt), index=y.index)
+    temp["var.cluster.1"] = pd.Series(clusters[:, 1], index=y.index)
     clusters = pd.DataFrame(temp, index=y.index)
 
-    mod.fit(cov_type='clustered', clusters=clusters)
+    mod.fit(cov_type="clustered", clusters=clusters)
 
 
 def test_nested_effects(data):
@@ -112,18 +124,18 @@ def test_nested_effects(data):
     effects = pd.DataFrame(y.entity_ids // 2, index=y.index)
     with pytest.raises(ValueError) as exception:
         PanelOLS(data.y, data.x, entity_effects=True, other_effects=effects)
-    assert 'entity effects' in str(exception.value)
+    assert "entity effects" in str(exception.value)
 
     effects = pd.DataFrame(y.time_ids // 2, index=y.index)
     with pytest.raises(ValueError) as exception:
         PanelOLS(data.y, data.x, time_effects=True, other_effects=effects)
-    assert 'time effects' in str(exception.value)
+    assert "time effects" in str(exception.value)
 
     effects1 = pd.Series(y.entity_ids.squeeze() // 2, index=y.index)
     effects2 = pd.Series(y.entity_ids.squeeze() // 4, index=y.index)
-    effects = pd.DataFrame({'eff1': effects1, 'eff2': effects2})
+    effects = pd.DataFrame({"eff1": effects1, "eff2": effects2})
     with pytest.raises(ValueError) as exception:
         PanelOLS(data.y, data.x, other_effects=effects)
-    assert 'by other effects' in str(exception.value)
-    assert 'time effects' not in str(exception.value)
-    assert 'entity effects' not in str(exception.value)
+    assert "by other effects" in str(exception.value)
+    assert "time effects" not in str(exception.value)
+    assert "entity effects" not in str(exception.value)
diff --git a/linearmodels/tests/panel/test_data.py b/linearmodels/tests/panel/test_data.py
index c67f547d1b..c5b36ded7e 100644
--- a/linearmodels/tests/panel/test_data.py
+++ b/linearmodels/tests/panel/test_data.py
@@ -1,13 +1,13 @@
 from linearmodels.compat.numpy import lstsq
 from linearmodels.compat.pandas import get_codes, is_string_dtype
 
-from itertools import product
 from datetime import datetime
+from itertools import product
 
 import numpy as np
 from numpy.linalg import pinv
 from numpy.testing import assert_allclose, assert_equal
-from pandas import (Categorical, DataFrame, Series, date_range, get_dummies)
+from pandas import Categorical, DataFrame, Series, date_range, get_dummies
 from pandas.testing import assert_frame_equal, assert_index_equal
 import pytest
@@ -23,15 +23,23 @@
     pass
 
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 PERC_MISSING = [0, 0.02, 0.10, 0.33]
 TYPES = datatypes
 
 
-@pytest.fixture(params=list(product(PERC_MISSING, TYPES)),
-                ids=list(map(lambda x: str(int(100 * x[0])) + '-' + str(x[1]),
-                             product(PERC_MISSING, TYPES))))
+@pytest.fixture(
+    params=list(product(PERC_MISSING, TYPES)),
+    ids=list(
+        map(
+            lambda x: str(int(100 * x[0])) + "-" + str(x[1]),
+            product(PERC_MISSING, TYPES),
+        )
+    ),
+)
 def data(request):
     missing, datatype = request.param
     return generate_data(missing, datatype, ntk=(231, 7, 5))
@@ -42,9 +50,9 @@ def mi_df():
     np.random.seed(12345)
     n, t, k = 11, 7, 3
     x = np.random.standard_normal((k, t, n))
-    major = date_range('12-31-1999', periods=7)
-    items = ['var.{0}'.format(i) for i in range(1, k + 1)]
-    minor = ['entities.{0}'.format(i) for i in range(1, n + 1)]
+    major = date_range("12-31-1999", periods=7)
+    items = ["var.{0}".format(i) for i in range(1, k + 1)]
+    minor = ["entities.{0}".format(i) for i in range(1, n + 1)]
     return panel_to_frame(x, items, major, minor, swap=True)
@@ -57,12 +65,17 @@ def test_numpy_3d():
     assert dh.nobs == t
     assert dh.nvar == k
     assert_equal(np.reshape(x.T, (n * t, k)), dh.values2d)
-    items = ['entity.{0}'.format(i) for i in range(n)]
+    items = ["entity.{0}".format(i) for i in range(n)]
     obs = [i for i in range(t)]
-    var_names = ['x.{0}'.format(i) for i in range(k)]
-    expected_frame = panel_to_frame(np.reshape(x, (k, t, n)), items=var_names,
-                                    major_axis=obs, minor_axis=items, swap=True)
-    expected_frame.index.set_names(['entity', 'time'], inplace=True)
+    var_names = ["x.{0}".format(i) for i in range(k)]
+    expected_frame = panel_to_frame(
+        np.reshape(x, (k, t, n)),
+        items=var_names,
+        major_axis=obs,
+        minor_axis=items,
+        swap=True,
+    )
+    expected_frame.index.set_names(["entity", "time"], inplace=True)
     assert_frame_equal(dh.dataframe, expected_frame)
@@ -87,9 +100,9 @@ def test_numpy_2d():
 def test_pandas_multiindex_dataframe():
     n, t, k = 11, 7, 3
     x = np.random.random((n, t, k))
-    major = date_range('12-31-1999', periods=7)
-    minor = ['var.{0}'.format(i) for i in range(1, k + 1)]
-    items = ['item.{0}'.format(i) for i in range(1, n + 1)]
+    major = date_range("12-31-1999", periods=7)
+    minor = ["var.{0}".format(i) for i in range(1, k + 1)]
+    items = ["item.{0}".format(i) for i in range(1, n + 1)]
     x = panel_to_frame(x, items=items, major_axis=major, minor_axis=minor, swap=True)
     PanelData(x)
@@ -97,8 +110,8 @@ def test_pandas_dataframe():
     t, n = 11, 7
     x = np.random.random((t, n))
-    index = date_range('12-31-1999', periods=t)
-    cols = ['entity.{0}'.format(i) for i in range(1, n + 1)]
+    index = date_range("12-31-1999", periods=t)
+    cols = ["entity.{0}".format(i) for i in range(1, n + 1)]
     x = DataFrame(x, columns=cols, index=index)
     PanelData(x)
@@ -106,34 +119,40 @@ def test_existing_panel_data():
     n, t, k = 11, 7, 3
     x = np.random.random((k, t, n))
-    major = date_range('12-31-1999', periods=7)
-    items = ['var.{0}'.format(i) for i in range(1, k + 1)]
-    minor = ['entities.{0}'.format(i) for i in range(1, n + 1)]
+    major = date_range("12-31-1999", periods=7)
+    items = ["var.{0}".format(i) for i in range(1, k + 1)]
+    minor = ["entities.{0}".format(i) for i in range(1, n + 1)]
     x = panel_to_frame(x, items=items, major_axis=major, minor_axis=minor, swap=True)
     dh = PanelData(x)
     dh2 = PanelData(dh)
     assert_frame_equal(dh.dataframe, dh2.dataframe)
 
 
-@pytest.mark.skipif(MISSING_XARRAY, reason='xarray is not installed')
+@pytest.mark.skipif(MISSING_XARRAY, reason="xarray is not installed")
 def test_xarray_2d():
     n, t = 11, 7
     x = np.random.random((t, n))
-    x = xr.DataArray(x, dims=('time', 'entity'),
-                     coords={
-                         'entity': list('firm.' + str(i) for i in range(n))})
+    x = xr.DataArray(
+        x,
+        dims=("time", "entity"),
+        coords={"entity": list("firm." + str(i) for i in range(n))},
+    )
     dh = PanelData(x)
     assert_equal(dh.values2d, np.reshape(x.values.T, (n * t, 1)))
 
 
-@pytest.mark.skipif(MISSING_XARRAY, reason='xarray is not installed')
+@pytest.mark.skipif(MISSING_XARRAY, reason="xarray is not installed")
 def test_xarray_3d():
     n, t, k = 11, 7, 13
     x = np.random.random((k, t, n))
-    x = xr.DataArray(x, dims=('var', 'time', 'entity'),
-                     coords={
-                         'entity': list('firm.' + str(i) for i in range(n)),
-                         'var': list('x.' + str(i) for i in range(k))})
+    x = xr.DataArray(
+        x,
+        dims=("var", "time", "entity"),
+        coords={
+            "entity": list("firm." + str(i) for i in range(n)),
+            "var": list("x." + str(i) for i in range(k)),
+        },
+    )
     dh = PanelData(x)
     assert_equal(np.reshape(x.values.T, (n * t, k)), dh.values2d)
@@ -172,11 +191,15 @@ def test_missing(mi_df):
 
 def test_incorrect_dataframe():
     grouped = np.array(list([i] * 10 for i in range(10))).ravel()
-    df = DataFrame({'a': np.arange(100),
-                    'b': grouped,
-                    'c': np.random.permutation(grouped),
-                    'data': np.random.randn(100)})
-    df = df.set_index(['a', 'b', 'c'])
+    df = DataFrame(
+        {
+            "a": np.arange(100),
+            "b": grouped,
+            "c": np.random.permutation(grouped),
+            "data": np.random.randn(100),
+        }
+    )
+    df = df.set_index(["a", "b", "c"])
     with pytest.raises(ValueError):
         PanelData(df)
@@ -186,7 +209,7 @@ def test_incorrect_types():
         PanelData(list(np.random.randn(10)))
 
 
-@pytest.mark.skipif(MISSING_XARRAY, reason='xarray is not installed')
+@pytest.mark.skipif(MISSING_XARRAY, reason="xarray is not installed")
 def test_incorrect_types_xarray():
     with pytest.raises(ValueError):
         PanelData(xr.DataArray(np.random.randn(10)))
@@ -198,8 +221,8 @@ def test_ids(mi_df):
     assert eids.shape == (77, 1)
     assert len(np.unique(eids)) == 11
     for i in range(0, len(eids), 7):
-        assert np.ptp(eids[i:i + 7]) == 0
-        assert np.all((eids[i + 8:] - eids[i]) != 0)
+        assert np.ptp(eids[i : i + 7]) == 0
+        assert np.all((eids[i + 8 :] - eids[i]) != 0)
 
     tids = data.time_ids
     assert tids.shape == (77, 1)
@@ -210,19 +233,19 @@ def test_ids(mi_df):
 
 def test_str_repr(mi_df):
     data = PanelData(mi_df)
-    assert 'PanelData' in str(data)
+    assert "PanelData" in str(data)
     assert str(hex(id(data))) in data.__repr__()
 
 
 def test_demean(mi_df):
     data = PanelData(mi_df)
-    fe = data.demean('entity')
+    fe = data.demean("entity")
     expected = data.values3d.copy()
     for i in range(3):
         expected[i] -= expected[i].mean(0)
     assert_allclose(fe.values3d, expected)
 
-    te = data.demean('time')
+    te = data.demean("time")
     expected = data.values3d.copy()
     for i in range(3):
         expected[i] -= expected[i].mean(1)[:, None]
@@ -237,11 +260,11 @@ def demean(x):
         return x - x.mean()
 
     entity_demean = df.groupby(level=0).transform(demean)
-    res = dh.demean('entity')
+    res = dh.demean("entity")
     assert_allclose(entity_demean.values, res.values2d)
 
     time_demean = df.groupby(level=1).transform(demean)
-    res = dh.demean('time')
+    res = dh.demean("time")
     assert_allclose(time_demean.values, res.values2d)
@@ -255,16 +278,14 @@ def test_demean_against_dummy_regression(data):
     cat = Categorical(no_index[df.index.levels[0].name])
get_dummies(cat, drop_first=False).astype(np.float64) dummy_demeaned = df.values - d @ lstsq(d, df.values)[0] - entity_demean = dh.demean('entity') - assert_allclose(1 + np.abs(entity_demean.values2d), - 1 + np.abs(dummy_demeaned)) + entity_demean = dh.demean("entity") + assert_allclose(1 + np.abs(entity_demean.values2d), 1 + np.abs(dummy_demeaned)) cat = Categorical(no_index[df.index.levels[1].name]) d = get_dummies(cat, drop_first=False).astype(np.float64) dummy_demeaned = df.values - d @ lstsq(d, df.values)[0] - time_demean = dh.demean('time') - assert_allclose(1 + np.abs(time_demean.values2d), - 1 + np.abs(dummy_demeaned)) + time_demean = dh.demean("time") + assert_allclose(1 + np.abs(time_demean.values2d), 1 + np.abs(dummy_demeaned)) cat = Categorical(no_index[df.index.levels[0].name]) d1 = get_dummies(cat, drop_first=False).astype(np.float64) @@ -272,21 +293,20 @@ def test_demean_against_dummy_regression(data): d2 = get_dummies(cat, drop_first=True).astype(np.float64) d = np.c_[d1.values, d2.values] dummy_demeaned = df.values - d @ lstsq(d, df.values)[0] - both_demean = dh.demean('both') - assert_allclose(1 + np.abs(both_demean.values2d), - 1 + np.abs(dummy_demeaned)) + both_demean = dh.demean("both") + assert_allclose(1 + np.abs(both_demean.values2d), 1 + np.abs(dummy_demeaned)) def test_demean_missing(mi_df): mi_df.values.flat[::13] = np.nan data = PanelData(mi_df) - fe = data.demean('entity') + fe = data.demean("entity") expected = data.values3d.copy() for i in range(3): expected[i] -= np.nanmean(expected[i], 0) assert_allclose(fe.values3d, expected) - te = data.demean('time') + te = data.demean("time") expected = data.values3d.copy() for i in range(3): expected[i] -= np.nanmean(expected[i], 1)[:, None] @@ -305,7 +325,7 @@ def test_demean_many_missing(mi_df): mi_df.loc[time, column] = np.nan mi_df.index = mi_df.index.swaplevel() data = PanelData(mi_df) - fe = data.demean('entity') + fe = data.demean("entity") orig_nan = np.isnan(data.values3d.ravel()) fe_nan = np.isnan(fe.values3d.ravel()) assert np.all(fe_nan[orig_nan]) @@ -318,7 +338,7 @@ def test_demean_many_missing(mi_df): expected[i] -= mu assert_allclose(fe.values3d, expected) - te = data.demean('time') + te = data.demean("time") expected = data.values3d.copy() for i in range(3): mu = np.ones((expected[i].shape[0], 1)) * np.nan @@ -342,7 +362,7 @@ def test_demean_many_missing_dropped(mi_df): data = PanelData(mi_df) data.drop(data.isnull) - fe = data.demean('entity') + fe = data.demean("entity") expected = data.values2d.copy() eid = data.entity_ids.ravel() @@ -354,11 +374,11 @@ def test_demean_many_missing_dropped(mi_df): def test_demean_both_large_t(): x = np.random.standard_normal((1, 100, 10)) - time = date_range('1-1-2000', periods=100) - entities = ['entity.{0}'.format(i) for i in range(10)] - data = panel_to_frame(x, ['x'], time, entities, swap=True) + time = date_range("1-1-2000", periods=100) + entities = ["entity.{0}".format(i) for i in range(10)] + data = panel_to_frame(x, ["x"], time, entities, swap=True) data = PanelData(data) - demeaned = data.demean('both') + demeaned = data.demean("both") df = data.dataframe no_index = df.reset_index() @@ -368,14 +388,13 @@ def test_demean_both_large_t(): d2 = get_dummies(cat, drop_first=True).astype(np.float64) d = np.c_[d1.values, d2.values] dummy_demeaned = df.values - d @ pinv(d) @ df.values - assert_allclose(1 + np.abs(demeaned.values2d), - 1 + np.abs(dummy_demeaned)) + assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(dummy_demeaned)) def 
test_demean_invalid(mi_df): data = PanelData(mi_df) with pytest.raises(ValueError): - data.demean('unknown') + data.demean("unknown") def test_dummies(mi_df): @@ -383,14 +402,14 @@ def test_dummies(mi_df): edummy = data.dummies() assert edummy.shape == (77, 11) assert np.all(edummy.sum(0) == 7) - tdummy = data.dummies(group='time') + tdummy = data.dummies(group="time") assert tdummy.shape == (77, 7) assert np.all(tdummy.sum(0) == 11) - tdummy_drop = data.dummies(group='time', drop_first=True) + tdummy_drop = data.dummies(group="time", drop_first=True) assert tdummy_drop.shape == (77, 6) assert np.all(tdummy.sum(0) == 11) with pytest.raises(ValueError): - data.dummies('unknown') + data.dummies("unknown") def test_roundtrip_3d(data): @@ -419,29 +438,32 @@ def test_demean_missing_alt_types(data): check = isinstance(data.x, (DataFrame, np.ndarray)) xpd = PanelData(data.x) xpd.drop(xpd.isnull) - entity_demean = xpd.demean('entity') + entity_demean = xpd.demean("entity") expected = xpd.dataframe.groupby(level=0).transform(lambda s: s - s.mean()) - assert_frame_equal(entity_demean.dataframe, expected, - check_index_type=check, - check_column_type=check) - - time_demean = xpd.demean('time') + assert_frame_equal( + entity_demean.dataframe, + expected, + check_index_type=check, + check_column_type=check, + ) + + time_demean = xpd.demean("time") expected = xpd.dataframe.groupby(level=1).transform(lambda s: s - s.mean()) - assert_frame_equal(time_demean.dataframe, expected, - check_index_type=check, - check_column_type=check) + assert_frame_equal( + time_demean.dataframe, expected, check_index_type=check, check_column_type=check + ) def test_mean_missing(data): xpd = PanelData(data.x) xpd.drop(xpd.isnull) - entity_mean = xpd.mean('entity') + entity_mean = xpd.mean("entity") expected = xpd.dataframe.groupby(level=0).mean() expected = expected.loc[xpd.entities] expected.columns.name = None assert_frame_equal(entity_mean, expected) - time_mean = xpd.mean('time') + time_mean = xpd.mean("time") expected = xpd.dataframe.groupby(level=1).mean() expected = expected.loc[xpd.time] expected.columns.name = None @@ -451,14 +473,14 @@ def test_mean_missing(data): def test_count(data): xpd = PanelData(data.x) xpd.drop(xpd.isnull) - entity_mean = xpd.count('entity') + entity_mean = xpd.count("entity") expected = xpd.dataframe.groupby(level=0).count() expected = expected.loc[xpd.entities] expected.columns.name = None expected = expected.astype(np.int64) assert_frame_equal(entity_mean, expected) - time_mean = xpd.count('time') + time_mean = xpd.count("time") expected = xpd.dataframe.groupby(level=1).count() expected = expected.loc[xpd.time] expected.columns.name = None @@ -478,15 +500,17 @@ def test_demean_simple_weighted(data): x.drop(missing) w.drop(missing) w.dataframe.iloc[:, 0] = 1 - unweighted_entity_demean = x.demean('entity') - weighted_entity_demean = x.demean('entity', weights=w) - assert_allclose(unweighted_entity_demean.dataframe, - weighted_entity_demean.dataframe) + unweighted_entity_demean = x.demean("entity") + weighted_entity_demean = x.demean("entity", weights=w) + assert_allclose( + unweighted_entity_demean.dataframe, weighted_entity_demean.dataframe + ) - unweighted_entity_demean = x.demean('time') - weighted_entity_demean = x.demean('time', weights=w) - assert_allclose(unweighted_entity_demean.dataframe, - weighted_entity_demean.dataframe) + unweighted_entity_demean = x.demean("time") + weighted_entity_demean = x.demean("time", weights=w) + assert_allclose( + unweighted_entity_demean.dataframe, 
weighted_entity_demean.dataframe + ) def test_demean_weighted(data): @@ -496,7 +520,7 @@ def test_demean_weighted(data): x.drop(missing) w.drop(missing) - entity_demean = x.demean('entity', weights=w) + entity_demean = x.demean("entity", weights=w) d = get_dummies(Categorical(get_codes(x.index)[0])) d = d.values root_w = np.sqrt(w.values2d) @@ -504,10 +528,9 @@ def test_demean_weighted(data): wd = d * root_w mu = wd @ lstsq(wd, wx)[0] e = wx - mu - assert_allclose(1 + np.abs(entity_demean.values2d), - 1 + np.abs(e)) + assert_allclose(1 + np.abs(entity_demean.values2d), 1 + np.abs(e)) - time_demean = x.demean('time', weights=w) + time_demean = x.demean("time", weights=w) d = get_dummies(Categorical(get_codes(x.index)[1])) d = d.values root_w = np.sqrt(w.values2d) @@ -515,8 +538,7 @@ def test_demean_weighted(data): wd = d * root_w mu = wd @ lstsq(wd, wx)[0] e = wx - mu - assert_allclose(1 + np.abs(time_demean.values2d), - 1 + np.abs(e)) + assert_allclose(1 + np.abs(time_demean.values2d), 1 + np.abs(e)) def test_mean_weighted(data): @@ -525,7 +547,7 @@ def test_mean_weighted(data): missing = x.isnull | w.isnull x.drop(missing) w.drop(missing) - entity_mean = x.mean('entity', weights=w) + entity_mean = x.mean("entity", weights=w) c = x.index.levels[0][get_codes(x.index)[0]] d = get_dummies(Categorical(c, ordered=True)) d = d[entity_mean.index] @@ -536,7 +558,7 @@ def test_mean_weighted(data): mu = lstsq(wd, wx)[0] assert_allclose(entity_mean, mu) - time_mean = x.mean('time', weights=w) + time_mean = x.mean("time", weights=w) c = x.index.levels[1][get_codes(x.index)[1]] d = get_dummies(Categorical(c, ordered=True)) d = d[list(time_mean.index)] @@ -550,91 +572,94 @@ def test_mean_weighted(data): def test_categorical_conversion(): t, n = 3, 1000 - string = np.random.choice(['a', 'b', 'c'], (t, n)) + string = np.random.choice(["a", "b", "c"], (t, n)) num = np.random.randn(t, n) - time = date_range('1-1-2000', periods=t) - entities = ['entity.{0}'.format(i) for i in range(n)] - p = panel_to_frame(None, items=['a', 'b'], major_axis=time, - minor_axis=entities, swap=True) - p['a'] = string.T.ravel() - p['b'] = num.T.ravel() - p = p[['a', 'b']] + time = date_range("1-1-2000", periods=t) + entities = ["entity.{0}".format(i) for i in range(n)] + p = panel_to_frame( + None, items=["a", "b"], major_axis=time, minor_axis=entities, swap=True + ) + p["a"] = string.T.ravel() + p["b"] = num.T.ravel() + p = p[["a", "b"]] panel = PanelData(p, convert_dummies=False) df = panel.dataframe.copy() - df['a'] = Categorical(df['a']) + df["a"] = Categorical(df["a"]) panel = PanelData(df, convert_dummies=True) df = panel.dataframe assert df.shape == (3000, 3) s = string.T.ravel() - a_locs = np.where(s == 'a') - b_locs = np.where(s == 'b') - c_locs = np.where(s == 'c') - assert np.all(df.loc[:, 'a.b'].values[a_locs] == 0.0) - assert np.all(df.loc[:, 'a.b'].values[b_locs] == 1.0) - assert np.all(df.loc[:, 'a.b'].values[c_locs] == 0.0) + a_locs = np.where(s == "a") + b_locs = np.where(s == "b") + c_locs = np.where(s == "c") + assert np.all(df.loc[:, "a.b"].values[a_locs] == 0.0) + assert np.all(df.loc[:, "a.b"].values[b_locs] == 1.0) + assert np.all(df.loc[:, "a.b"].values[c_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[a_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[b_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[c_locs] == 1.0) + assert np.all(df.loc[:, "a.c"].values[a_locs] == 0.0) + assert np.all(df.loc[:, "a.c"].values[b_locs] == 0.0) + assert np.all(df.loc[:, "a.c"].values[c_locs] == 1.0) def 
test_string_conversion(): t, n = 3, 1000 - string = np.random.choice(['a', 'b', 'c'], (t, n)) + string = np.random.choice(["a", "b", "c"], (t, n)) num = np.random.randn(t, n) - time = date_range('1-1-2000', periods=t) - entities = ['entity.{0}'.format(i) for i in range(n)] - p = panel_to_frame(None, items=['a', 'b'], major_axis=time, minor_axis=entities, - swap=True) - p['a'] = string.T.ravel() - p['b'] = num.T.ravel() - p = p[['a', 'b']] - panel = PanelData(p, var_name='OtherEffect') + time = date_range("1-1-2000", periods=t) + entities = ["entity.{0}".format(i) for i in range(n)] + p = panel_to_frame( + None, items=["a", "b"], major_axis=time, minor_axis=entities, swap=True + ) + p["a"] = string.T.ravel() + p["b"] = num.T.ravel() + p = p[["a", "b"]] + panel = PanelData(p, var_name="OtherEffect") df = panel.dataframe assert df.shape == (3000, 3) s = string.T.ravel() - a_locs = np.where(s == 'a') - b_locs = np.where(s == 'b') - c_locs = np.where(s == 'c') - assert np.all(df.loc[:, 'a.b'].values[a_locs] == 0.0) - assert np.all(df.loc[:, 'a.b'].values[b_locs] == 1.0) - assert np.all(df.loc[:, 'a.b'].values[c_locs] == 0.0) + a_locs = np.where(s == "a") + b_locs = np.where(s == "b") + c_locs = np.where(s == "c") + assert np.all(df.loc[:, "a.b"].values[a_locs] == 0.0) + assert np.all(df.loc[:, "a.b"].values[b_locs] == 1.0) + assert np.all(df.loc[:, "a.b"].values[c_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[a_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[b_locs] == 0.0) - assert np.all(df.loc[:, 'a.c'].values[c_locs] == 1.0) + assert np.all(df.loc[:, "a.c"].values[a_locs] == 0.0) + assert np.all(df.loc[:, "a.c"].values[b_locs] == 0.0) + assert np.all(df.loc[:, "a.c"].values[c_locs] == 1.0) def test_string_nonconversion(): t, n = 3, 1000 - string = np.random.choice(['a', 'b', 'c'], (t, n)) + string = np.random.choice(["a", "b", "c"], (t, n)) num = np.random.randn(t, n) - time = date_range('1-1-2000', periods=t) - entities = ['entity.{0}'.format(i) for i in range(n)] - p = panel_to_frame(None, items=['a', 'b'], major_axis=time, minor_axis=entities, - swap=True) - p['a'] = string.T.ravel() - p['b'] = num.T.ravel() - panel = PanelData(p, var_name='OtherEffect', convert_dummies=False) - assert is_string_dtype(panel.dataframe['a'].dtype) - assert np.all(panel.dataframe['a'] == string.T.ravel()) + time = date_range("1-1-2000", periods=t) + entities = ["entity.{0}".format(i) for i in range(n)] + p = panel_to_frame( + None, items=["a", "b"], major_axis=time, minor_axis=entities, swap=True + ) + p["a"] = string.T.ravel() + p["b"] = num.T.ravel() + panel = PanelData(p, var_name="OtherEffect", convert_dummies=False) + assert is_string_dtype(panel.dataframe["a"].dtype) + assert np.all(panel.dataframe["a"] == string.T.ravel()) def test_repr_html(mi_df): data = PanelData(mi_df) html = data._repr_html_() - assert '
<br/>' in html
+    assert "<br/>
" in html def test_general_demean_oneway(mi_df): y = PanelData(mi_df) - dm1 = y.demean('entity') + dm1 = y.demean("entity") g = DataFrame(y.entity_ids, index=y.index) dm2 = y.general_demean(g) assert_allclose(dm1.values2d, dm2.values2d) - dm1 = y.demean('time') + dm1 = y.demean("time") g = DataFrame(y.time_ids, index=y.index) dm2 = y.general_demean(g) assert_allclose(dm1.values2d, dm2.values2d) @@ -649,9 +674,9 @@ def test_general_demean_oneway(mi_df): def test_general_demean_twoway(mi_df): y = PanelData(mi_df) - dm1 = y.demean('both') + dm1 = y.demean("both") g = DataFrame(y.entity_ids, index=y.index) - g['column2'] = Series(y.time_ids.squeeze(), index=y.index) + g["column2"] = Series(y.time_ids.squeeze(), index=y.index) dm2 = y.general_demean(g) assert_allclose(dm1.values2d, dm2.values2d) @@ -668,7 +693,7 @@ def test_general_demean_twoway(mi_df): def test_general_unit_weighted_demean_oneway(mi_df): y = PanelData(mi_df) - dm1 = y.demean('entity') + dm1 = y.demean("entity") g = PanelData(DataFrame(y.entity_ids, index=y.index)) weights = PanelData(g).copy() weights.dataframe.iloc[:, :] = 1 @@ -677,15 +702,14 @@ def test_general_unit_weighted_demean_oneway(mi_df): dm3 = y.general_demean(g) assert_allclose(dm3.values2d, dm2.values2d) - dm1 = y.demean('time') + dm1 = y.demean("time") g = PanelData(DataFrame(y.time_ids, index=y.index)) dm2 = y.general_demean(g, weights) assert_allclose(dm1.values2d, dm2.values2d) dm3 = y.general_demean(g) assert_allclose(dm3.values2d, dm2.values2d) - g = PanelData(DataFrame(np.random.randint(0, 10, g.dataframe.shape), - index=y.index)) + g = PanelData(DataFrame(np.random.randint(0, 10, g.dataframe.shape), index=y.index)) dm2 = y.general_demean(g, weights) dm3 = y.general_demean(g) g = Categorical(g.dataframe.iloc[:, 0]) @@ -698,21 +722,21 @@ def test_general_unit_weighted_demean_oneway(mi_df): def test_general_weighted_demean_oneway(mi_df): y = PanelData(mi_df) weights = DataFrame( - np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index) + np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index + ) w = PanelData(weights) - dm1 = y.demean('entity', weights=w) + dm1 = y.demean("entity", weights=w) g = PanelData(DataFrame(y.entity_ids, index=y.index)) dm2 = y.general_demean(g, w) assert_allclose(dm1.values2d, dm2.values2d) - dm1 = y.demean('time', weights=w) + dm1 = y.demean("time", weights=w) g = PanelData(DataFrame(y.time_ids, index=y.index)) dm2 = y.general_demean(g, w) assert_allclose(dm1.values2d, dm2.values2d) - g = PanelData(DataFrame(np.random.randint(0, 10, g.dataframe.shape), - index=y.index)) + g = PanelData(DataFrame(np.random.randint(0, 10, g.dataframe.shape), index=y.index)) dm2 = y.general_demean(g, w) g = Categorical(g.dataframe.iloc[:, 0]) d = get_dummies(g) @@ -726,15 +750,15 @@ def test_general_unit_weighted_demean_twoway(mi_df): np.random.seed(12345) y = PanelData(mi_df) weights = DataFrame( - np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index) + np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index + ) w = PanelData(weights) - dm1 = y.demean('both', weights=w) + dm1 = y.demean("both", weights=w) g = DataFrame(y.entity_ids, index=y.index) - g['column2'] = Series(y.time_ids.squeeze(), index=y.index) + g["column2"] = Series(y.time_ids.squeeze(), index=y.index) dm2 = y.general_demean(g, weights=w) - assert_allclose(dm1.values2d - dm2.values2d, np.zeros_like(dm2.values2d), - atol=1e-7) + assert_allclose(dm1.values2d - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7) g = 
DataFrame(np.random.randint(0, 10, g.shape), index=y.index) dm2 = y.general_demean(g, weights=w) @@ -761,8 +785,8 @@ def test_original_unmodified(data): mi_df_y = PanelData(data.y).dataframe mi_df_x = PanelData(data.x).dataframe - mi_df_y.index.names = ['firm', 'period'] - mi_df_x.index.names = ['firm', 'period'] + mi_df_y.index.names = ["firm", "period"] + mi_df_x.index.names = ["firm", "period"] mi_df_w = PanelData(data.w).dataframe pre_y = mi_df_y.copy() pre_x = mi_df_x.copy() @@ -784,38 +808,44 @@ def test_original_unmodified(data): def test_incorrect_time_axis(): x = np.random.randn(3, 3, 1000) - entities = ['entity.{0}'.format(i) for i in range(1000)] - time = ['time.{0}'.format(i) for i in range(3)] - var_names = ['var.{0}'.format(i) for i in range(3)] - p = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities, - swap=True) + entities = ["entity.{0}".format(i) for i in range(1000)] + time = ["time.{0}".format(i) for i in range(3)] + var_names = ["var.{0}".format(i) for i in range(3)] + p = panel_to_frame( + x, items=var_names, major_axis=time, minor_axis=entities, swap=True + ) with pytest.raises(ValueError): PanelData(p) time = [1, 2, 3] - var_names = ['var.{0}'.format(i) for i in range(3)] - p = panel_to_frame(x, items=var_names, major_axis=time, minor_axis=entities, - swap=True) - p.index = p.index.set_levels([1, datetime(1960, 1, 1), 'a'], 1) + var_names = ["var.{0}".format(i) for i in range(3)] + p = panel_to_frame( + x, items=var_names, major_axis=time, minor_axis=entities, swap=True + ) + p.index = p.index.set_levels([1, datetime(1960, 1, 1), "a"], 1) with pytest.raises(ValueError): PanelData(p) -@pytest.mark.skipif(MISSING_XARRAY, reason='xarray is not installed') +@pytest.mark.skipif(MISSING_XARRAY, reason="xarray is not installed") def test_incorrect_time_axis_xarray(): x = np.random.randn(3, 3, 1000) - entities = ['entity.{0}'.format(i) for i in range(1000)] - time = ['time.{0}'.format(i) for i in range(3)] - vars = ['x.{0}'.format(i) for i in range(3)] - da = xr.DataArray(x, coords={'entities': entities, 'time': time, - 'vars': vars}, - dims=['vars', 'time', 'entities']) + entities = ["entity.{0}".format(i) for i in range(1000)] + time = ["time.{0}".format(i) for i in range(3)] + vars = ["x.{0}".format(i) for i in range(3)] + da = xr.DataArray( + x, + coords={"entities": entities, "time": time, "vars": vars}, + dims=["vars", "time", "entities"], + ) with pytest.raises(ValueError): PanelData(da) - da = xr.DataArray(x, coords={'entities': entities, 'time': time, - 'vars': vars}, - dims=['vars', 'time', 'entities']) + da = xr.DataArray( + x, + coords={"entities": entities, "time": time, "vars": vars}, + dims=["vars", "time", "entities"], + ) with pytest.raises(ValueError): PanelData(da) @@ -829,8 +859,8 @@ def test_named_index(data): data.x.index.set_names([None, None], inplace=True) pdata = PanelData(data.x) - assert pdata.dataframe.index.levels[0].name == 'entity' - assert pdata.dataframe.index.levels[1].name == 'time' + assert pdata.dataframe.index.levels[0].name == "entity" + assert pdata.dataframe.index.levels[1].name == "time" def test_fake_panel_properties(mi_df): diff --git a/linearmodels/tests/panel/test_fama_macbeth.py b/linearmodels/tests/panel/test_fama_macbeth.py index e1b99b6fc5..1b0d5f4f09 100644 --- a/linearmodels/tests/panel/test_fama_macbeth.py +++ b/linearmodels/tests/panel/test_fama_macbeth.py @@ -15,18 +15,22 @@ from linearmodels.utility import (InferenceUnavailableWarning, MissingValueWarning) -pytestmark = 
pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) missing = [0.0, 0.20] has_const = [True, False] perms = list(product(missing, datatypes, has_const)) -ids = list(map(lambda s: '-'.join(map(str, s)), perms)) +ids = list(map(lambda s: "-".join(map(str, s)), perms)) @pytest.fixture(params=perms, ids=ids) def data(request): missing, datatype, const = request.param - return generate_data(missing, datatype, const=const, other_effects=1, ntk=(25, 200, 5)) + return generate_data( + missing, datatype, const=const, other_effects=1, ntk=(25, 200, 5) + ) def test_fama_macbeth(data): @@ -60,17 +64,17 @@ def test_fama_macbeth(data): def test_unknown_cov_type(data): with pytest.raises(ValueError): - FamaMacBeth(data.y, data.x).fit(cov_type='unknown') + FamaMacBeth(data.y, data.x).fit(cov_type="unknown") def test_fama_macbeth_kernel_smoke(data): - FamaMacBeth(data.y, data.x).fit(cov_type='kernel') - FamaMacBeth(data.y, data.x).fit(cov_type='kernel', kernel='bartlett') - FamaMacBeth(data.y, data.x).fit(cov_type='kernel', kernel='newey-west') - FamaMacBeth(data.y, data.x).fit(cov_type='kernel', kernel='parzen') - FamaMacBeth(data.y, data.x).fit(cov_type='kernel', kernel='qs') - FamaMacBeth(data.y, data.x).fit(cov_type='kernel', bandwidth=3) - res = FamaMacBeth(data.y, data.x).fit(cov_type='kernel', kernel='andrews') + FamaMacBeth(data.y, data.x).fit(cov_type="kernel") + FamaMacBeth(data.y, data.x).fit(cov_type="kernel", kernel="bartlett") + FamaMacBeth(data.y, data.x).fit(cov_type="kernel", kernel="newey-west") + FamaMacBeth(data.y, data.x).fit(cov_type="kernel", kernel="parzen") + FamaMacBeth(data.y, data.x).fit(cov_type="kernel", kernel="qs") + FamaMacBeth(data.y, data.x).fit(cov_type="kernel", bandwidth=3) + res = FamaMacBeth(data.y, data.x).fit(cov_type="kernel", kernel="andrews") access_attributes(res) @@ -79,30 +83,32 @@ def test_fitted_effects_residuals(data): res = mod.fit() expected = mod.exog.values2d @ res.params.values - expected = pd.DataFrame(expected, index=mod.exog.index, columns=['fitted_values']) + expected = pd.DataFrame(expected, index=mod.exog.index, columns=["fitted_values"]) assert_allclose(res.fitted_values, expected) assert_frame_similar(res.fitted_values, expected) expected.iloc[:, 0] = mod.dependent.values2d - expected.values - expected.columns = ['idiosyncratic'] + expected.columns = ["idiosyncratic"] assert_allclose(res.idiosyncratic, expected) assert_frame_similar(res.idiosyncratic, expected) expected.iloc[:, 0] = np.nan - expected.columns = ['estimated_effects'] + expected.columns = ["estimated_effects"] assert_allclose(res.estimated_effects, expected) assert_frame_similar(res.estimated_effects, expected) -@pytest.mark.filterwarnings('always::linearmodels.utility.MissingValueWarning') +@pytest.mark.filterwarnings("always::linearmodels.utility.MissingValueWarning") def test_block_size_warnings(): y = np.arange(12.0)[:, None] x = np.ones((12, 3)) x[:, 1] = np.arange(12.0) x[:, 2] = np.arange(12.0) ** 2 - idx = pd.MultiIndex.from_product([['a', 'b', 'c'], pd.date_range('2000-1-1', periods=4)]) - y = pd.DataFrame(y, index=idx, columns=['y']) - x = pd.DataFrame(x, index=idx, columns=['x1', 'x2', 'x3']) + idx = pd.MultiIndex.from_product( + [["a", "b", "c"], pd.date_range("2000-1-1", periods=4)] + ) + y = pd.DataFrame(y, index=idx, columns=["y"]) + x = pd.DataFrame(x, index=idx, columns=["x1", "x2", "x3"]) with pytest.warns(MissingValueWarning): 
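        # Slicing to 11 of the 12 rows leaves the last entity one time period
        # short; presumably PanelData balances the panel with a NaN cell that
        # is dropped during estimation, which is what raises
        # MissingValueWarning here.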
FamaMacBeth(y.iloc[:11], x.iloc[:11]) with pytest.warns(InferenceUnavailableWarning): @@ -114,8 +120,10 @@ def test_block_size_error(): x = np.ones((12, 2)) x[1::4, 1] = 2 x[2::4, 1] = 3 - idx = pd.MultiIndex.from_product([['a', 'b', 'c'], pd.date_range('2000-1-1', periods=4)]) - y = pd.DataFrame(y, index=idx, columns=['y']) - x = pd.DataFrame(x, index=idx, columns=['x1', 'x2']) + idx = pd.MultiIndex.from_product( + [["a", "b", "c"], pd.date_range("2000-1-1", periods=4)] + ) + y = pd.DataFrame(y, index=idx, columns=["y"]) + x = pd.DataFrame(x, index=idx, columns=["x1", "x2"]) with pytest.raises(ValueError): FamaMacBeth(y, x) diff --git a/linearmodels/tests/panel/test_firstdifference_ols.py b/linearmodels/tests/panel/test_firstdifference_ols.py index 3886aeb054..0f45b41845 100644 --- a/linearmodels/tests/panel/test_firstdifference_ols.py +++ b/linearmodels/tests/panel/test_firstdifference_ols.py @@ -12,11 +12,13 @@ assert_results_equal, datatypes, generate_data) -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) missing = [0.0, 0.20] perms = list(product(missing, datatypes)) -ids = list(map(lambda s: '-'.join(map(str, s)), perms)) +ids = list(map(lambda s: "-".join(map(str, s)), perms)) @pytest.fixture(params=perms, ids=ids) @@ -32,16 +34,22 @@ def test_firstdifference_ols(data): y = mod.dependent.values3d x = mod.exog.values3d dy = np.array(y[0, 1:] - y[0, :-1]) - dy = pd.DataFrame(dy, index=mod.dependent.panel.major_axis[1:], - columns=mod.dependent.panel.minor_axis) + dy = pd.DataFrame( + dy, + index=mod.dependent.panel.major_axis[1:], + columns=mod.dependent.panel.minor_axis, + ) dy = dy.T.stack() dy = dy.reindex(mod.dependent.index) dx = x[:, 1:] - x[:, :-1] _dx = {} for i, dxi in enumerate(dx): - temp = pd.DataFrame(dxi, index=mod.dependent.panel.major_axis[1:], - columns=mod.dependent.panel.minor_axis) + temp = pd.DataFrame( + dxi, + index=mod.dependent.panel.major_axis[1:], + columns=mod.dependent.panel.minor_axis, + ) temp = temp.T.stack() temp = temp.reindex(mod.dependent.index) _dx[mod.exog.vars[i]] = temp @@ -54,30 +62,32 @@ def test_firstdifference_ols(data): dx = dx.loc[~drop] ols_mod = IV2SLS(dy, dx, None, None) - ols_res = ols_mod.fit(cov_type='unadjusted') + ols_res = ols_mod.fit(cov_type="unadjusted") assert_results_equal(res, ols_res) - res = mod.fit(cov_type='robust', debiased=False) - ols_res = ols_mod.fit(cov_type='robust') + res = mod.fit(cov_type="robust", debiased=False) + ols_res = ols_mod.fit(cov_type="robust") assert_results_equal(res, ols_res) clusters = data.vc1 ols_clusters = mod.reformat_clusters(data.vc1) fd = mod.dependent.first_difference() ols_clusters = ols_clusters.dataframe.loc[fd.index] - res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False) - ols_res = ols_mod.fit(cov_type='clustered', clusters=ols_clusters) + res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False) + ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters) assert_results_equal(res, ols_res) - res = mod.fit(cov_type='clustered', cluster_entity=True, debiased=False) + res = mod.fit(cov_type="clustered", cluster_entity=True, debiased=False) entity_clusters = mod.dependent.first_difference().entity_ids - ols_res = ols_mod.fit(cov_type='clustered', clusters=entity_clusters) + ols_res = ols_mod.fit(cov_type="clustered", clusters=entity_clusters) assert_results_equal(res, ols_res) - 
ols_clusters['entity.clusters'] = entity_clusters + ols_clusters["entity.clusters"] = entity_clusters ols_clusters = ols_clusters.astype(np.int32) - res = mod.fit(cov_type='clustered', cluster_entity=True, clusters=data.vc1, debiased=False) - ols_res = ols_mod.fit(cov_type='clustered', clusters=ols_clusters) + res = mod.fit( + cov_type="clustered", cluster_entity=True, clusters=data.vc1, debiased=False + ) + ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters) assert_results_equal(res, ols_res) @@ -88,16 +98,22 @@ def test_firstdifference_ols_weighted(data): y = mod.dependent.values3d x = mod.exog.values3d dy = np.array(y[0, 1:] - y[0, :-1]) - dy = pd.DataFrame(dy, index=mod.dependent.panel.major_axis[1:], - columns=mod.dependent.panel.minor_axis) + dy = pd.DataFrame( + dy, + index=mod.dependent.panel.major_axis[1:], + columns=mod.dependent.panel.minor_axis, + ) dy = dy.T.stack() dy = dy.reindex(mod.dependent.index) dx = x[:, 1:] - x[:, :-1] _dx = {} for i, dxi in enumerate(dx): - temp = pd.DataFrame(dxi, index=mod.dependent.panel.major_axis[1:], - columns=mod.dependent.panel.minor_axis) + temp = pd.DataFrame( + dxi, + index=mod.dependent.panel.major_axis[1:], + columns=mod.dependent.panel.minor_axis, + ) temp = temp.T.stack() temp = temp.reindex(mod.dependent.index) _dx[mod.exog.vars[i]] = temp @@ -109,8 +125,11 @@ def test_firstdifference_ols_weighted(data): w = mod.weights.values3d w = 1.0 / w sw = w[0, 1:] + w[0, :-1] - sw = pd.DataFrame(sw, index=mod.dependent.panel.major_axis[1:], - columns=mod.dependent.panel.minor_axis) + sw = pd.DataFrame( + sw, + index=mod.dependent.panel.major_axis[1:], + columns=mod.dependent.panel.minor_axis, + ) sw = sw.T.stack() sw = sw.reindex(mod.dependent.index) sw = 1.0 / sw @@ -122,11 +141,11 @@ def test_firstdifference_ols_weighted(data): sw = sw.loc[~drop] ols_mod = IV2SLS(dy, dx, None, None, weights=sw) - ols_res = ols_mod.fit(cov_type='unadjusted') + ols_res = ols_mod.fit(cov_type="unadjusted") assert_results_equal(res, ols_res) - res = mod.fit(cov_type='robust', debiased=False) - ols_res = ols_mod.fit(cov_type='robust') + res = mod.fit(cov_type="robust", debiased=False) + ols_res = ols_mod.fit(cov_type="robust") assert_results_equal(res, ols_res) clusters = data.vc1 @@ -134,8 +153,8 @@ def test_firstdifference_ols_weighted(data): fd = mod.dependent.first_difference() ols_clusters = ols_clusters.dataframe.loc[fd.index] - res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False) - ols_res = ols_mod.fit(cov_type='clustered', clusters=ols_clusters) + res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False) + ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters) assert_results_equal(res, ols_res) @@ -153,7 +172,7 @@ def test_first_difference_errors(data): if not isinstance(data.x, pd.DataFrame): return x = data.x.copy() - x['Intercept'] = 1.0 + x["Intercept"] = 1.0 with pytest.raises(ValueError): FirstDifferenceOLS(data.y, x) @@ -173,7 +192,7 @@ def test_firstdifference_error(data): clusters.iloc[::3, :] = clusters.iloc[::3, :] + 1 with pytest.raises(ValueError): - mod.fit(cov_type='clustered', clusters=clusters) + mod.fit(cov_type="clustered", clusters=clusters) def test_fitted_effects_residuals(data): @@ -181,16 +200,16 @@ def test_fitted_effects_residuals(data): res = mod.fit() expected = mod.exog.values2d @ res.params.values - expected = pd.DataFrame(expected, index=mod.exog.index, columns=['fitted_values']) + expected = pd.DataFrame(expected, index=mod.exog.index, columns=["fitted_values"]) 
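    # The decomposition exercised here is y = X @ params + estimated_effects
    # + idiosyncratic. FirstDifferenceOLS estimates no effects, so
    # estimated_effects is expected to be all-NaN below and the residual
    # check reduces to y - X @ params.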
assert_allclose(res.fitted_values, expected) assert_frame_similar(res.fitted_values, expected) expected.iloc[:, 0] = mod.dependent.values2d - expected.values - expected.columns = ['idiosyncratic'] + expected.columns = ["idiosyncratic"] assert_allclose(res.idiosyncratic, expected) assert_frame_similar(res.idiosyncratic, expected) expected.iloc[:, 0] = np.nan - expected.columns = ['estimated_effects'] + expected.columns = ["estimated_effects"] assert_allclose(res.estimated_effects, expected) assert_frame_similar(res.estimated_effects, expected) diff --git a/linearmodels/tests/panel/test_formula.py b/linearmodels/tests/panel/test_formula.py index cdd2a6fc18..4760ad0245 100644 --- a/linearmodels/tests/panel/test_formula.py +++ b/linearmodels/tests/panel/test_formula.py @@ -13,23 +13,29 @@ PanelOLS, PooledOLS, RandomEffects) from linearmodels.tests.panel._utility import datatypes, generate_data -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) PERC_MISSING = [0, 0.02, 0.10, 0.33] TYPES = datatypes -@pytest.fixture(params=list(product(PERC_MISSING, TYPES)), - ids=list(map(lambda x: str(int(100 * x[0])) + '-' + str(x[1]), - product(PERC_MISSING, TYPES)))) +@pytest.fixture( + params=list(product(PERC_MISSING, TYPES)), + ids=list( + map( + lambda x: str(int(100 * x[0])) + "-" + str(x[1]), + product(PERC_MISSING, TYPES), + ) + ), +) def data(request): missing, datatype = request.param return generate_data(missing, datatype, ntk=(91, 7, 5)) -@pytest.fixture(params=['y ~ x1 + x2', - 'y ~ x0 + x1 + x2 + x3 + x4 '], - scope='module') +@pytest.fixture(params=["y ~ x1 + x2", "y ~ x0 + x1 + x2 + x3 + x4 "], scope="module") def formula(request): return request.param @@ -56,7 +62,7 @@ def test_basic_formulas(data, models, formula): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y + joined["y"] = data.y model, formula_func = models mod = model.from_formula(formula, joined) res = mod.fit() @@ -67,8 +73,8 @@ def test_basic_formulas(data, models, formula): res2 = mod2.fit() np.testing.assert_allclose(res.params, res2.params) - parts = formula.split('~') - vars = parts[1].replace(' 1 ', ' const ').split('+') + parts = formula.split("~") + vars = parts[1].replace(" 1 ", " const ").split("+") vars = list(map(lambda s: s.strip(), vars)) x = data.x res2 = model(data.y, x[vars]).fit() @@ -81,9 +87,9 @@ def test_basic_formulas(data, models, formula): if model is FirstDifferenceOLS: return - formula = formula.split('~') - formula[1] = ' 1 + ' + formula[1] - formula = '~'.join(formula) + formula = formula.split("~") + formula[1] = " 1 + " + formula[1] + formula = "~".join(formula) mod = model.from_formula(formula, joined) res = mod.fit() @@ -91,8 +97,8 @@ def test_basic_formulas(data, models, formula): res2 = mod2.fit() np.testing.assert_allclose(res.params, res2.params) - x['Intercept'] = 1.0 - vars = ['Intercept'] + vars + x["Intercept"] = 1.0 + vars = ["Intercept"] + vars mod2 = model(data.y, x[vars]) res2 = mod2.fit() np.testing.assert_allclose(res.params, res2.params) @@ -103,9 +109,9 @@ def test_basic_formulas_math_op(data, models, formula): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y - formula = formula.replace('x0', 'np.exp(x0)') - formula = formula.replace('x1', 'sigmoid(x1)') + joined["y"] = data.y + formula = formula.replace("x0", "np.exp(x0)") + formula = formula.replace("x1", "sigmoid(x1)") 
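    # Formula terms are evaluated against the caller's namespace (via patsy),
    # so vectorized callables such as np.exp, or the module-level sigmoid
    # assumed to be defined in this test file, can appear inline in the
    # formula string.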
model, formula_func = models res = model.from_formula(formula, joined).fit() pred = res.predict(data=joined) @@ -117,8 +123,8 @@ def test_panel_ols_formulas_math_op(data): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y - formula = 'y ~ x1 + np.exp(x2)' + joined["y"] = data.y + formula = "y ~ x1 + np.exp(x2)" mod = PanelOLS.from_formula(formula, joined) mod.fit() @@ -127,24 +133,24 @@ def test_panel_ols_formula(data): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y - formula = 'y ~ x1 + x2' + joined["y"] = data.y + formula = "y ~ x1 + x2" mod = PanelOLS.from_formula(formula, joined) assert mod.formula == formula - formula = 'y ~ x1 + x2 + EntityEffects' + formula = "y ~ x1 + x2 + EntityEffects" mod = PanelOLS.from_formula(formula, joined) assert mod.formula == formula assert mod.entity_effects is True assert mod.time_effects is False - formula = 'y ~ x1 + x2 + TimeEffects' + formula = "y ~ x1 + x2 + TimeEffects" mod = PanelOLS.from_formula(formula, joined) assert mod.formula == formula assert mod.time_effects is True assert mod.entity_effects is False - formula = 'y ~ x1 + EntityEffects + TimeEffects + x2 ' + formula = "y ~ x1 + EntityEffects + TimeEffects + x2 " mod = PanelOLS.from_formula(formula, joined) assert mod.formula == formula assert mod.entity_effects is True @@ -154,7 +160,7 @@ def test_panel_ols_formula(data): res2 = mod2.fit() np.testing.assert_allclose(res.params, res2.params) - formula = 'y ~ x1 + EntityEffects + FixedEffects + x2 ' + formula = "y ~ x1 + EntityEffects + FixedEffects + x2 " with pytest.raises(ValueError): PanelOLS.from_formula(formula, joined) @@ -163,7 +169,7 @@ def test_basic_formulas_predict(data, models, formula): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y + joined["y"] = data.y model, formula_func = models mod = model.from_formula(formula, joined) res = mod.fit() @@ -174,8 +180,8 @@ def test_basic_formulas_predict(data, models, formula): pred2 = res2.predict(data=joined) np.testing.assert_allclose(pred.values, pred2.values, atol=1e-8) - parts = formula.split('~') - vars = parts[1].replace(' 1 ', ' const ').split('+') + parts = formula.split("~") + vars = parts[1].replace(" 1 ", " const ").split("+") vars = list(map(lambda s: s.strip(), vars)) x = data.x res2 = model(data.y, x[vars]).fit() @@ -187,15 +193,15 @@ def test_basic_formulas_predict(data, models, formula): if model is FirstDifferenceOLS: return - formula = formula.split('~') - formula[1] = ' 1 + ' + formula[1] - formula = '~'.join(formula) + formula = formula.split("~") + formula[1] = " 1 + " + formula[1] + formula = "~".join(formula) mod = model.from_formula(formula, joined) res = mod.fit() pred = res.predict(data=joined) - x['Intercept'] = 1.0 - vars = ['Intercept'] + vars + x["Intercept"] = 1.0 + vars = ["Intercept"] + vars mod2 = model(data.y, x[vars]) res2 = mod2.fit() pred2 = res.predict(x[vars]) @@ -208,7 +214,7 @@ def test_formulas_predict_error(data, models, formula): if not isinstance(data.y, DataFrame): return joined = data.x - joined['y'] = data.y + joined["y"] = data.y model, formula_func = models mod = model.from_formula(formula, joined) res = mod.fit() @@ -217,8 +223,8 @@ def test_formulas_predict_error(data, models, formula): with pytest.raises(ValueError): mod.predict(params=res.params, exog=joined, data=joined) - parts = formula.split('~') - vars = parts[1].replace(' 1 ', ' const ').split('+') + parts = formula.split("~") + vars = parts[1].replace(" 1 ", " const 
").split("+") vars = list(map(lambda s: s.strip(), vars)) x = data.x res = model(data.y, x[vars]).fit() @@ -230,9 +236,9 @@ def test_parser(data, formula, effects): if not isinstance(data.y, DataFrame): return if effects: - formula += ' + EntityEffects + TimeEffects' + formula += " + EntityEffects + TimeEffects" joined = data.x - joined['y'] = data.y + joined["y"] = data.y parser = PanelFormulaParser(formula, joined) dep, exog = parser.data assert_frame_equal(parser.dependent, dep) @@ -241,10 +247,10 @@ def test_parser(data, formula, effects): assert parser.eval_env == 3 parser.eval_env = 2 assert parser.eval_env == 2 - assert parser.entity_effect == ('EntityEffects' in formula) - assert parser.time_effect == ('TimeEffects' in formula) + assert parser.entity_effect == ("EntityEffects" in formula) + assert parser.time_effect == ("TimeEffects" in formula) - formula += ' + FixedEffects ' + formula += " + FixedEffects " if effects: with pytest.raises(ValueError): PanelFormulaParser(formula, joined) diff --git a/linearmodels/tests/panel/test_model.py b/linearmodels/tests/panel/test_model.py index db2f40657d..b4094842e1 100644 --- a/linearmodels/tests/panel/test_model.py +++ b/linearmodels/tests/panel/test_model.py @@ -10,15 +10,23 @@ from linearmodels.panel.utility import AbsorbingEffectError from linearmodels.tests.panel._utility import datatypes, generate_data, lsdv -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) PERC_MISSING = [0, 0.02, 0.10, 0.33] TYPES = datatypes -@pytest.fixture(params=list(product(PERC_MISSING, TYPES)), - ids=list(map(lambda x: str(int(100 * x[0])) + '-' + str(x[1]), - product(PERC_MISSING, TYPES)))) +@pytest.fixture( + params=list(product(PERC_MISSING, TYPES)), + ids=list( + map( + lambda x: str(int(100 * x[0])) + "-" + str(x[1]), + product(PERC_MISSING, TYPES), + ) + ), +) def data(request): missing, datatype = request.param rng = np.random.RandomState(12345) @@ -155,7 +163,7 @@ def test_incorrect_weight_shape(data): w = data.w if isinstance(w, pd.DataFrame): entities = w.index.levels[0][:4] - w = w.loc[pd.IndexSlice[entities[0]:entities[-1]], :] + w = w.loc[pd.IndexSlice[entities[0] : entities[-1]], :] elif isinstance(w, np.ndarray): w = w[:3] w = w[None, :, :] @@ -170,7 +178,7 @@ def test_weight_ambiguity(data): if isinstance(data.x, pd.DataFrame): t = len(data.y.index.levels[1]) entities = data.x.index.levels[0] - slice = pd.IndexSlice[entities[0]:entities[t - 1]] + slice = pd.IndexSlice[entities[0] : entities[t - 1]] x = data.x.loc[slice, :] else: t = data.x.shape[1] @@ -181,7 +189,7 @@ def test_weight_ambiguity(data): PanelOLS(y, x, weights=weights) -@pytest.mark.parametrize('intercept', [True, False]) +@pytest.mark.parametrize("intercept", [True, False]) def test_absorbing_effect(data, intercept): x = data.x.copy() if isinstance(data.x, pd.DataFrame): @@ -189,15 +197,15 @@ def test_absorbing_effect(data, intercept): ntime = len(x.index.levels[1]) temp = data.x.iloc[:, 0].copy() temp.values[:] = 1.0 - temp.values[:(ntime * (nentity // 2))] = 0 + temp.values[: (ntime * (nentity // 2))] = 0 if intercept: - x['Intercept'] = 1.0 - x['absorbed'] = temp + x["Intercept"] = 1.0 + x["absorbed"] = temp else: intercept_vals = np.ones((1, x.shape[1], x.shape[2])) absorbed = np.ones((1, x.shape[1], x.shape[2])) - absorbed[:, :, :x.shape[2] // 2] = 0 + absorbed[:, :, : x.shape[2] // 2] = 0 if intercept: extra = [x, intercept_vals, 
absorbed] else: @@ -209,10 +217,10 @@ def test_absorbing_effect(data, intercept): mod.fit() var_names = mod.exog.vars assert var_names[3] in str(exc_info.value) - assert (' ' * (2 - intercept) + var_names[-1]) in str(exc_info.value) + assert (" " * (2 - intercept) + var_names[-1]) in str(exc_info.value) -@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_all_missing(data): y = PanelData(data.y) x = PanelData(data.x) @@ -220,6 +228,7 @@ def test_all_missing(data): y.drop(missing) x.drop(missing) import warnings + with warnings.catch_warnings(record=True) as w: PanelOLS(y.dataframe, x.dataframe).fit() assert len(w) == 0 diff --git a/linearmodels/tests/panel/test_panel_covariance.py b/linearmodels/tests/panel/test_panel_covariance.py index dbdd087e19..48822566a2 100644 --- a/linearmodels/tests/panel/test_panel_covariance.py +++ b/linearmodels/tests/panel/test_panel_covariance.py @@ -25,96 +25,162 @@ def setup_class(cls): cls.cluster5 = np.random.randint(0, 10, (cls.n * cls.t, 3)) def test_heteroskedastic_smoke(self): - cov = HeteroskedasticCovariance(self.y, self.x, self.params, self.entity_ids, - self.time_ids, extra_df=0).cov + cov = HeteroskedasticCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, extra_df=0 + ).cov assert cov.shape == (self.k, self.k) - cov = HeteroskedasticCovariance(self.y, self.x, self.params, self.entity_ids, - self.time_ids, extra_df=0).cov + cov = HeteroskedasticCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, extra_df=0 + ).cov assert cov.shape == (self.k, self.k) def test_homoskedastic_smoke(self): - cov = HomoskedasticCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0).cov + cov = HomoskedasticCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, extra_df=0 + ).cov assert cov.shape == (self.k, self.k) - cov = HomoskedasticCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0).cov + cov = HomoskedasticCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, extra_df=0 + ).cov assert cov.shape == (self.k, self.k) def test_clustered_covariance_smoke(self): - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0).cov - assert cov.shape == (self.k, self.k) - - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster1).cov - assert cov.shape == (self.k, self.k) - - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster2, group_debias=True).cov - assert cov.shape == (self.k, self.k) - - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster3).cov - assert cov.shape == (self.k, self.k) - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster3, group_debias=True).cov - assert cov.shape == (self.k, self.k) - - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster4).cov - assert cov.shape == (self.k, self.k) - - cov = ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster4, group_debias=True).cov + cov = ClusteredCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, extra_df=0 + 
).cov + assert cov.shape == (self.k, self.k) + + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster1, + ).cov + assert cov.shape == (self.k, self.k) + + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster2, + group_debias=True, + ).cov + assert cov.shape == (self.k, self.k) + + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster3, + ).cov + assert cov.shape == (self.k, self.k) + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster3, + group_debias=True, + ).cov + assert cov.shape == (self.k, self.k) + + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster4, + ).cov + assert cov.shape == (self.k, self.k) + + cov = ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster4, + group_debias=True, + ).cov assert cov.shape == (self.k, self.k) def test_clustered_covariance_error(self): with pytest.raises(ValueError): - ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster5) + ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster5, + ) with pytest.raises(ValueError): - ClusteredCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - extra_df=0, - clusters=self.cluster4[::2]) + ClusteredCovariance( + self.y, + self.x, + self.params, + self.entity_ids, + self.time_ids, + extra_df=0, + clusters=self.cluster4[::2], + ) def test_driscoll_kraay_smoke(self): - cov = DriscollKraay(self.y, self.x, self.params, self.entity_ids, self.time_ids).cov + cov = DriscollKraay( + self.y, self.x, self.params, self.entity_ids, self.time_ids + ).cov assert cov.shape == (self.k, self.k) - cov = DriscollKraay(self.y, self.x, self.params, self.entity_ids, self.time_ids, - kernel='parzen').cov + cov = DriscollKraay( + self.y, self.x, self.params, self.entity_ids, self.time_ids, kernel="parzen" + ).cov assert cov.shape == (self.k, self.k) - cov = DriscollKraay(self.y, self.x, self.params, self.entity_ids, self.time_ids, - bandwidth=12).cov + cov = DriscollKraay( + self.y, self.x, self.params, self.entity_ids, self.time_ids, bandwidth=12 + ).cov assert cov.shape == (self.k, self.k) def test_ac_covariance_smoke(self): - cov = ACCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids).cov + cov = ACCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids + ).cov assert cov.shape == (self.k, self.k) - cov = ACCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - kernel='parzen').cov + cov = ACCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, kernel="parzen" + ).cov assert cov.shape == (self.k, self.k) - cov = ACCovariance(self.y, self.x, self.params, self.entity_ids, self.time_ids, - bandwidth=12).cov + cov = ACCovariance( + self.y, self.x, self.params, self.entity_ids, self.time_ids, bandwidth=12 + ).cov assert cov.shape == (self.k, self.k) def test_covariance_manager(): - cm = CovarianceManager('made-up-class', HomoskedasticCovariance, HeteroskedasticCovariance) + cm = CovarianceManager( + "made-up-class", 
HomoskedasticCovariance, HeteroskedasticCovariance + ) with pytest.raises(ValueError): - cm['clustered'] + cm["clustered"] with pytest.raises(KeyError): - cm['unknown'] + cm["unknown"] - assert cm['unadjusted'] is HomoskedasticCovariance - assert cm['homoskedastic'] is HomoskedasticCovariance - assert cm['robust'] is HeteroskedasticCovariance - assert cm['heteroskedastic'] is HeteroskedasticCovariance + assert cm["unadjusted"] is HomoskedasticCovariance + assert cm["homoskedastic"] is HomoskedasticCovariance + assert cm["robust"] is HeteroskedasticCovariance + assert cm["heteroskedastic"] is HeteroskedasticCovariance diff --git a/linearmodels/tests/panel/test_panel_ols.py b/linearmodels/tests/panel/test_panel_ols.py index 2d2a21f775..5c7f41bcd5 100644 --- a/linearmodels/tests/panel/test_panel_ols.py +++ b/linearmodels/tests/panel/test_panel_ols.py @@ -10,29 +10,33 @@ from linearmodels.iv.model import IV2SLS from linearmodels.panel.data import PanelData from linearmodels.panel.model import PanelOLS, PooledOLS +from linearmodels.panel.utility import AbsorbingEffectWarning from linearmodels.tests.panel._utility import (access_attributes, assert_frame_similar, assert_results_equal, datatypes, generate_data) -from linearmodels.panel.utility import AbsorbingEffectWarning from linearmodels.utility import AttrDict, MemoryWarning -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning', - 'ignore:the matrix subclass:PendingDeprecationWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning", + "ignore:the matrix subclass:PendingDeprecationWarning", +) missing = [0.0, 0.02, 0.20] has_const = [True, False] perms = list(product(missing, datatypes, has_const)) -ids = list(map(lambda s: '-'.join(map(str, s)), perms)) +ids = list(map(lambda s: "-".join(map(str, s)), perms)) @pytest.fixture(params=perms, ids=ids) def data(request): missing, datatype, const = request.param - return generate_data(missing, datatype, const=const, ntk=(91, 15, 5), other_effects=2) + return generate_data( + missing, datatype, const=const, ntk=(91, 15, 5), other_effects=2 + ) -@pytest.fixture(params=['numpy', 'pandas']) +@pytest.fixture(params=["numpy", "pandas"]) def absorbed_data(request): datatype = request.param rng = np.random.RandomState(12345) @@ -49,29 +53,37 @@ def absorbed_data(request): # pandas < 0.24 codes = data.x.index.labels absorbed = np.array(codes[0]).astype(np.double) - data.x['x_absorbed'] = absorbed + data.x["x_absorbed"] = absorbed return data @pytest.fixture(params=perms, ids=ids) def large_data(request): missing, datatype, const = request.param - return generate_data(missing, datatype, const=const, ntk=(51, 71, 5), other_effects=2) + return generate_data( + missing, datatype, const=const, ntk=(51, 71, 5), other_effects=2 + ) -singleton_ids = [i for i, p in zip(ids, perms) if p[1] == 'pandas' and not p[-1]] -singleton_perms = [p for p in perms if p[1] == 'pandas' and not p[-1]] +singleton_ids = [i for i, p in zip(ids, perms) if p[1] == "pandas" and not p[-1]] +singleton_perms = [p for p in perms if p[1] == "pandas" and not p[-1]] @pytest.fixture(params=singleton_perms, ids=singleton_ids) def singleton_data(request): missing, datatype, const = request.param - return generate_data(missing, datatype, const=const, ntk=(91, 15, 5), other_effects=2, - num_cats=[5 * 91, 15]) + return generate_data( + missing, + datatype, + const=const, + ntk=(91, 15, 5), + other_effects=2, + num_cats=[5 * 91, 15], + ) perms = list(product(missing, 
 datatypes))
-ids = list(map(lambda s: '-'.join(map(str, s)), perms))
+ids = list(map(lambda s: "-".join(map(str, s)), perms))
 
 
 @pytest.fixture(params=perms, ids=ids)
@@ -81,7 +93,7 @@ def const_data(request):
     y = PanelData(data.y).dataframe
     x = y.copy()
     x.iloc[:, :] = 1
-    x.columns = ['Const']
+    x.columns = ["Const"]
     return AttrDict(y=y, x=x, w=PanelData(data.w).dataframe)
@@ -95,22 +107,29 @@ def time_eff(request):
     return request.param
 
 
-perms = [p for p in product([True, False], [True, False], [True, False], [0, 1, 2]) if
-         sum(p[1:]) <= 2]
+perms = [
+    p
+    for p in product([True, False], [True, False], [True, False], [0, 1, 2])
+    if sum(p[1:]) <= 2
+]
 ids = []
 for p in perms:
-    str_id = 'weighted' if p[0] else 'unweighted'
-    str_id += '-entity_effects' if p[1] else ''
-    str_id += '-time_effects' if p[2] else ''
-    str_id += '-{0}_other_effects'.format(p[3]) if p[3] else ''
+    str_id = "weighted" if p[0] else "unweighted"
+    str_id += "-entity_effects" if p[1] else ""
+    str_id += "-time_effects" if p[2] else ""
+    str_id += "-{0}_other_effects".format(p[3]) if p[3] else ""
     ids.append(str_id)
 
 
 @pytest.fixture(params=perms, ids=ids)
 def lsdv_config(request):
     weights, entity_effects, time_effects, other_effects = request.param
-    return AttrDict(weights=weights, entity_effects=entity_effects, time_effects=time_effects,
-                    other_effects=other_effects)
+    return AttrDict(
+        weights=weights,
+        entity_effects=entity_effects,
+        time_effects=time_effects,
+        other_effects=other_effects,
+    )
 
 
 def test_const_data_only(const_data):
@@ -135,7 +154,7 @@ def test_const_data_entity(const_data):
     res = mod.fit(debiased=False)
 
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('entity', drop_first=True)
+    d = mod.dependent.dummies("entity", drop_first=True)
     d.iloc[:, :] = d.values - x.values @ lstsq(x.values, d.values)[0]
 
     xd = np.c_[x.values, d.values]
@@ -153,7 +172,7 @@ def test_const_data_entity_weights(const_data):
     y = mod.dependent.dataframe
     w = mod.weights.dataframe
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('entity', drop_first=True)
+    d = mod.dependent.dummies("entity", drop_first=True)
     d_columns = list(d.columns)
 
     root_w = np.sqrt(w.values)
@@ -175,7 +194,7 @@ def test_const_data_time(const_data):
     res = mod.fit(debiased=False)
 
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('time', drop_first=True)
+    d = mod.dependent.dummies("time", drop_first=True)
     d.iloc[:, :] = d.values - x.values @ lstsq(x.values, d.values)[0]
 
     xd = np.c_[x.values, d.values]
@@ -193,7 +212,7 @@ def test_const_data_time_weights(const_data):
     y = mod.dependent.dataframe
     w = mod.weights.dataframe
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('time', drop_first=True)
+    d = mod.dependent.dummies("time", drop_first=True)
     d_columns = list(d.columns)
 
     root_w = np.sqrt(w.values)
@@ -215,10 +234,10 @@ def test_const_data_both(const_data):
     res = mod.fit(debiased=False)
 
     x = mod.exog.dataframe
-    d1 = mod.dependent.dummies('entity', drop_first=True)
-    d1.columns = ['d.entity.{0}'.format(i) for i in d1]
-    d2 = mod.dependent.dummies('time', drop_first=True)
-    d2.columns = ['d.time.{0}'.format(i) for i in d2]
+    d1 = mod.dependent.dummies("entity", drop_first=True)
+    d1.columns = ["d.entity.{0}".format(i) for i in d1]
+    d2 = mod.dependent.dummies("time", drop_first=True)
+    d2.columns = ["d.time.{0}".format(i) for i in d2]
     d = np.c_[d1.values, d2.values]
     d = pd.DataFrame(d, index=x.index, columns=list(d1.columns) + list(d2.columns))
     d.iloc[:, :] = d.values - x.values @ lstsq(x.values, d.values)[0]
@@ -238,10 +257,10 @@ def test_const_data_both_weights(const_data):
     w = mod.weights.dataframe
     x = mod.exog.dataframe
-    d1 = mod.dependent.dummies('entity', drop_first=True)
-    d1.columns = ['d.entity.{0}'.format(i) for i in d1]
-    d2 = mod.dependent.dummies('time', drop_first=True)
-    d2.columns = ['d.time.{0}'.format(i) for i in d2]
+    d1 = mod.dependent.dummies("entity", drop_first=True)
+    d1.columns = ["d.entity.{0}".format(i) for i in d1]
+    d2 = mod.dependent.dummies("time", drop_first=True)
+    d2.columns = ["d.time.{0}".format(i) for i in d2]
     d = np.c_[d1.values, d2.values]
 
     root_w = np.sqrt(w.values)
     z = np.ones_like(x)
@@ -276,53 +295,73 @@ def test_panel_entity_lsdv(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
     if mod.has_constant:
-        d = mod.dependent.dummies('entity', drop_first=True)
+        d = mod.dependent.dummies("entity", drop_first=True)
         z = np.ones_like(y)
         d_demean = d.values - z @ lstsq(z, d.values)[0]
     else:
-        d = mod.dependent.dummies('entity', drop_first=False)
+        d = mod.dependent.dummies("entity", drop_first=False)
         d_demean = d.values
 
     xd = np.c_[x.values, d_demean]
     xd = pd.DataFrame(xd, index=x.index, columns=list(x.columns) + list(d.columns))
 
     ols_mod = IV2SLS(y, xd, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted', debiased=False)
+    res2 = ols_mod.fit(cov_type="unadjusted", debiased=False)
     assert_results_equal(res, res2, test_fit=False)
     assert_allclose(res.rsquared_inclusive, res2.rsquared)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc1
     ols_clusters = mod.reformat_clusters(data.vc1)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc2
     ols_clusters = mod.reformat_clusters(data.vc2)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -333,22 +372,22 @@ def test_panel_entity_fwl(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
     if mod.has_constant:
-        d = mod.dependent.dummies('entity', drop_first=True)
+        d = mod.dependent.dummies("entity", drop_first=True)
         z = np.ones_like(y)
         d_demean = d.values - z @ lstsq(z, d.values)[0]
     else:
-        d = mod.dependent.dummies('entity', drop_first=False)
+        d = mod.dependent.dummies("entity", drop_first=False)
         d_demean = d.values
 
     x = x - d_demean @ lstsq(d_demean, x)[0]
     y = y - d_demean @ lstsq(d_demean, y)[0]
 
     ols_mod = IV2SLS(y, x, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_df=False)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_df=False)
@@ -358,7 +397,7 @@ def test_panel_time_lsdv(large_data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('time', drop_first=mod.has_constant)
+    d = mod.dependent.dummies("time", drop_first=mod.has_constant)
     d_cols = list(d.columns)
     d = d.values
     if mod.has_constant:
@@ -369,42 +408,62 @@
     xd = pd.DataFrame(xd, index=x.index, columns=list(x.columns) + d_cols)
 
     ols_mod = IV2SLS(y, xd, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
     assert_allclose(res.rsquared_inclusive, res2.rsquared)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = large_data.vc1
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = large_data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -414,7 +473,7 @@ def test_panel_time_fwl(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
-    d = mod.dependent.dummies('time', drop_first=mod.has_constant)
+    d = mod.dependent.dummies("time", drop_first=mod.has_constant)
     d = d.values
     if mod.has_constant:
         z = np.ones_like(y)
@@ -424,11 +483,11 @@
     y = y - d @ lstsq(d, y)[0]
 
     ols_mod = IV2SLS(y, x, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_df=False)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_df=False)
@@ -438,8 +497,8 @@ def test_panel_both_lsdv(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
-    d1 = mod.dependent.dummies('entity', drop_first=mod.has_constant)
-    d2 = mod.dependent.dummies('time', drop_first=True)
+    d1 = mod.dependent.dummies("entity", drop_first=mod.has_constant)
+    d2 = mod.dependent.dummies("time", drop_first=True)
     d = np.c_[d1.values, d2.values]
 
     if mod.has_constant:
@@ -447,47 +506,67 @@
         d = d - z @ lstsq(z, d)[0]
 
     xd = np.c_[x.values, d]
-    xd = pd.DataFrame(xd,
-                      index=x.index,
-                      columns=list(x.columns) + list(d1.columns) + list(d2.columns))
+    xd = pd.DataFrame(
+        xd, index=x.index, columns=list(x.columns) + list(d1.columns) + list(d2.columns)
+    )
 
     ols_mod = IV2SLS(y, xd, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
     assert_allclose(res.rsquared_inclusive, res2.rsquared)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc1
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -497,8 +576,8 @@ def test_panel_both_fwl(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
-    d1 = mod.dependent.dummies('entity', drop_first=mod.has_constant)
-    d2 = mod.dependent.dummies('time', drop_first=True)
+    d1 = mod.dependent.dummies("entity", drop_first=mod.has_constant)
+    d2 = mod.dependent.dummies("time", drop_first=True)
     d = np.c_[d1.values, d2.values]
 
     if mod.has_constant:
@@ -509,11 +588,11 @@
     y = y - d @ lstsq(d, y)[0]
 
     ols_mod = IV2SLS(y, x, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_df=False)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_df=False)
@@ -524,7 +603,7 @@ def test_panel_entity_lsdv_weighted(data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
     w = mod.weights.dataframe
-    d = mod.dependent.dummies('entity', drop_first=mod.has_constant)
+    d = mod.dependent.dummies("entity", drop_first=mod.has_constant)
     d_cols = d.columns
     d = d.values
     if mod.has_constant:
@@ -538,42 +617,62 @@
     xd = pd.DataFrame(xd, index=x.index, columns=list(x.columns) + list(d_cols))
 
     ols_mod = IV2SLS(y, xd, None, None, weights=w)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
     assert_allclose(res.rsquared_inclusive, res2.rsquared)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc1
    ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -584,7 +683,7 @@ def test_panel_time_lsdv_weighted(large_data):
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
     w = mod.weights.dataframe
-    d = mod.dependent.dummies('time', drop_first=mod.has_constant)
+    d = mod.dependent.dummies("time", drop_first=mod.has_constant)
     d_cols = d.columns
     d = d.values
     if mod.has_constant:
@@ -598,53 +697,75 @@
     xd = pd.DataFrame(xd, index=x.index, columns=list(x.columns) + list(d_cols))
 
     ols_mod = IV2SLS(y, xd, None, None, weights=w)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = large_data.vc1
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = large_data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
 
 def test_panel_both_lsdv_weighted(data):
-    mod = PanelOLS(data.y, data.x, entity_effects=True, time_effects=True, weights=data.w)
+    mod = PanelOLS(
+        data.y, data.x, entity_effects=True, time_effects=True, weights=data.w
+    )
     res = mod.fit(auto_df=False, count_effects=False, debiased=False)
 
     y = mod.dependent.dataframe
     x = mod.exog.dataframe
     w = mod.weights.dataframe
-    d1 = mod.dependent.dummies('entity', drop_first=mod.has_constant)
-    d2 = mod.dependent.dummies('time', drop_first=True)
+    d1 = mod.dependent.dummies("entity", drop_first=mod.has_constant)
+    d2 = mod.dependent.dummies("time", drop_first=True)
     d = np.c_[d1.values, d2.values]
 
     if mod.has_constant:
@@ -655,47 +776,67 @@ def test_panel_both_lsdv_weighted(data):
         d = d - z @ lstsq(wz, wd)[0]
 
     xd = np.c_[x.values, d]
-    xd = pd.DataFrame(xd,
-                      index=x.index,
-                      columns=list(x.columns) + list(d1.columns) + list(d2.columns))
+    xd = pd.DataFrame(
+        xd, index=x.index, columns=list(x.columns) + list(d1.columns) + list(d2.columns)
+    )
 
     ols_mod = IV2SLS(y, xd, None, None, weights=w)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
     assert_allclose(res.rsquared_inclusive, res2.rsquared)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc1
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False, count_effects=False,
-                  debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -740,7 +881,7 @@ def test_panel_entity_time_other_equivalence(data):
 
 def test_panel_other_lsdv(data):
     mod = PanelOLS(data.y, data.x, other_effects=data.c)
-    assert 'Num Other Effects: 2' in str(mod)
+    assert "Num Other Effects: 2" in str(mod)
     res = mod.fit(auto_df=False, count_effects=False, debiased=False)
 
     y = mod.dependent.dataframe.copy()
@@ -750,8 +891,10 @@
     d_columns = []
     for i, col in enumerate(c):
         s = c[col].copy()
-        dummies = pd.get_dummies(s.astype(np.int64), drop_first=(mod.has_constant or i > 0))
-        dummies.columns = [s.name + '_val_' + str(c) for c in dummies.columns]
+        dummies = pd.get_dummies(
+            s.astype(np.int64), drop_first=(mod.has_constant or i > 0)
+        )
+        dummies.columns = [s.name + "_val_" + str(c) for c in dummies.columns]
         d_columns.extend(list(dummies.columns))
         d.append(dummies.values)
     d = np.column_stack(d)
@@ -764,44 +907,66 @@
     xd = pd.DataFrame(xd, index=x.index, columns=list(x.columns) + list(d_columns))
 
     ols_mod = IV2SLS(y, xd, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_fit=False)
 
-    res3 = mod.fit(cov_type='unadjusted', auto_df=False, count_effects=False, debiased=False)
+    res3 = mod.fit(
+        cov_type="unadjusted", auto_df=False, count_effects=False, debiased=False
+    )
     assert_results_equal(res, res3)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc1
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False, count_effects=False,
-                  debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
     clusters = data.vc2
     ols_clusters = mod.reformat_clusters(clusters)
-    res = mod.fit(cov_type='clustered', clusters=clusters, auto_df=False,
-                  count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=ols_clusters.dataframe)
+    res = mod.fit(
+        cov_type="clustered",
+        clusters=clusters,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=ols_clusters.dataframe)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, auto_df=False,
-                  count_effects=False, debiased=False)
-    clusters = pd.DataFrame(mod.dependent.time_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_time=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.time_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
 
-    res = mod.fit(cov_type='clustered', cluster_entity=True, auto_df=False,
-                  count_effects=False, debiased=False)
-    clusters = pd.DataFrame(mod.dependent.entity_ids,
-                            index=mod.dependent.index,
-                            columns=['var.clust'])
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters)
+    res = mod.fit(
+        cov_type="clustered",
+        cluster_entity=True,
+        auto_df=False,
+        count_effects=False,
+        debiased=False,
+    )
+    clusters = pd.DataFrame(
+        mod.dependent.entity_ids, index=mod.dependent.index, columns=["var.clust"]
+    )
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters)
     assert_results_equal(res, res2, test_fit=False)
@@ -816,8 +981,10 @@ def test_panel_other_fwl(data):
     d_columns = []
     for i, col in enumerate(c):
         s = c[col].copy()
-        dummies = pd.get_dummies(s.astype(np.int64), drop_first=(mod.has_constant or i > 0))
-        dummies.columns = [s.name + '_val_' + str(c) for c in dummies.columns]
+        dummies = pd.get_dummies(
+            s.astype(np.int64), drop_first=(mod.has_constant or i > 0)
+        )
+        dummies.columns = [s.name + "_val_" + str(c) for c in dummies.columns]
         d_columns.extend(list(dummies.columns))
         d.append(dummies.values)
     d = np.column_stack(d)
@@ -830,11 +997,11 @@
     y = y - d @ lstsq(d, y)[0]
 
     ols_mod = IV2SLS(y, x, None, None)
-    res2 = ols_mod.fit(cov_type='unadjusted')
+    res2 = ols_mod.fit(cov_type="unadjusted")
     assert_results_equal(res, res2, test_df=False)
 
-    res = mod.fit(cov_type='robust', auto_df=False, count_effects=False, debiased=False)
-    res2 = ols_mod.fit(cov_type='robust')
+    res = mod.fit(cov_type="robust", auto_df=False, count_effects=False, debiased=False)
+    res2 = ols_mod.fit(cov_type="robust")
     assert_results_equal(res, res2, test_df=False)
@@ -844,7 +1011,7 @@ def test_panel_other_incorrect_size(data):
     x = mod.exog.dataframe
     cats = pd.DataFrame(mod.dependent.entity_ids, index=mod.dependent.index)
     cats = PanelData(cats)
-    cats = cats.dataframe.iloc[:cats.dataframe.shape[0] // 2, :]
+    cats = cats.dataframe.iloc[: cats.dataframe.shape[0] // 2, :]
 
     with pytest.raises(ValueError):
         PanelOLS(y, x, other_effects=cats)
@@ -869,7 +1036,7 @@ def test_results_access(data):
     const = PanelData(data.y).copy()
     const.dataframe.iloc[:, :] = 1
-    const.dataframe.columns = ['const']
+    const.dataframe.columns = ["const"]
     mod = PanelOLS(data.y, const)
     res = mod.fit()
     access_attributes(res)
@@ -889,50 +1056,57 @@ def test_alt_rsquared_weighted(data):
 
 def test_too_many_effects(data):
     with pytest.raises(ValueError):
-        PanelOLS(data.y, data.x, entity_effects=True, time_effects=True, other_effects=data.c)
+        PanelOLS(
+            data.y, data.x, entity_effects=True, time_effects=True, other_effects=data.c
+        )
 
 
 def test_cov_equiv_cluster(data):
     mod = PanelOLS(data.y, data.x, entity_effects=True)
-    res = mod.fit(cov_type='clustered', cluster_entity=True, debiased=False)
+    res = mod.fit(cov_type="clustered", cluster_entity=True, debiased=False)
     y = PanelData(data.y)
     clusters = pd.DataFrame(y.entity_ids, index=y.index)
-    res2 = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res2 = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
     assert_results_equal(res, res2)
 
     mod = PanelOLS(data.y, data.x, time_effects=True)
-    res = mod.fit(cov_type='clustered', cluster_time=True, debiased=False)
+    res = mod.fit(cov_type="clustered", cluster_time=True, debiased=False)
     y = PanelData(data.y)
     clusters = pd.DataFrame(y.time_ids, index=y.index)
-    res2 = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res2 = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
     assert_results_equal(res, res2)
 
-    res = mod.fit(cov_type='clustered', debiased=False)
-    res2 = mod.fit(cov_type='clustered', clusters=None, debiased=False)
+    res = mod.fit(cov_type="clustered", debiased=False)
+    res2 = mod.fit(cov_type="clustered", clusters=None, debiased=False)
     assert_results_equal(res, res2)
 
 
 def test_cluster_smoke(data):
     mod = PanelOLS(data.y, data.x, entity_effects=True)
-    mod.fit(cov_type='clustered', cluster_time=True, debiased=False)
-    mod.fit(cov_type='clustered', cluster_entity=True, debiased=False)
+    mod.fit(cov_type="clustered", cluster_time=True, debiased=False)
+    mod.fit(cov_type="clustered", cluster_entity=True, debiased=False)
     c2 = PanelData(data.vc2)
     c1 = PanelData(data.vc1)
 
-    mod.fit(cov_type='clustered', clusters=c2, debiased=False)
-    mod.fit(cov_type='clustered', cluster_entity=True, clusters=c1, debiased=False)
-    mod.fit(cov_type='clustered', cluster_time=True, clusters=c1, debiased=False)
+    mod.fit(cov_type="clustered", clusters=c2, debiased=False)
+    mod.fit(cov_type="clustered", cluster_entity=True, clusters=c1, debiased=False)
+    mod.fit(cov_type="clustered", cluster_time=True, clusters=c1, debiased=False)
    with pytest.raises(ValueError):
-        mod.fit(cov_type='clustered', cluster_time=True, clusters=c2, debiased=False)
+        mod.fit(cov_type="clustered", cluster_time=True, clusters=c2, debiased=False)
     with pytest.raises(ValueError):
-        mod.fit(cov_type='clustered', cluster_entity=True, clusters=c2, debiased=False)
+        mod.fit(cov_type="clustered", cluster_entity=True, clusters=c2, debiased=False)
     with pytest.raises(ValueError):
-        mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True, clusters=c1,
-                debiased=False)
+        mod.fit(
+            cov_type="clustered",
+            cluster_entity=True,
+            cluster_time=True,
+            clusters=c1,
+            debiased=False,
+        )
     with pytest.raises(ValueError):
-        clusters = c1.dataframe.iloc[:c1.dataframe.shape[0] // 2]
-        mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+        clusters = c1.dataframe.iloc[: c1.dataframe.shape[0] // 2]
+        mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
 
 
 def test_f_pooled(data):
@@ -943,7 +1117,7 @@
         mod2 = PooledOLS(data.y, data.x)
     else:
         exog = mod.exog.dataframe.copy()
-        exog['Intercept'] = 1.0
+        exog["Intercept"] = 1.0
         mod2 = PooledOLS(mod.dependent.dataframe, exog)
 
     res2 = mod2.fit(debiased=False)
@@ -953,7 +1127,7 @@
     v1 = res.df_model - res2.df_model
     v2 = res.df_resid
     f_pool = (eps2.T @ eps2 - eps.T @ eps) / v1
-    f_pool /= ((eps.T @ eps) / v2)
+    f_pool /= (eps.T @ eps) / v2
     f_pool = float(f_pool)
     assert_allclose(res.f_pooled.stat, f_pool)
     assert res.f_pooled.df == v1
@@ -966,7 +1140,7 @@
     v1 = res.df_model - res2.df_model
     v2 = res.df_resid
     f_pool = (eps2.T @ eps2 - eps.T @ eps) / v1
-    f_pool /= ((eps.T @ eps) / v2)
+    f_pool /= (eps.T @ eps) / v2
     f_pool = float(f_pool)
     assert_allclose(res.f_pooled.stat, f_pool)
     assert res.f_pooled.df == v1
@@ -979,7 +1153,7 @@
     v1 = res.df_model - res2.df_model
     v2 = res.df_resid
     f_pool = (eps2.T @ eps2 - eps.T @ eps) / v1
-    f_pool /= ((eps.T @ eps) / v2)
+    f_pool /= (eps.T @ eps) / v2
     f_pool = float(f_pool)
     assert_allclose(res.f_pooled.stat, f_pool)
     assert res.f_pooled.df == v1
@@ -1019,10 +1193,14 @@ def test_methods_equivalent(data, lsdv_config):
     elif lsdv_config.other_effects == 2:
         other_effects = data.c
     weights = data.w if lsdv_config.weights else None
-    mod = PanelOLS(data.y, data.x, weights=weights,
-                   entity_effects=lsdv_config.entity_effects,
-                   time_effects=lsdv_config.time_effects,
-                   other_effects=other_effects)
+    mod = PanelOLS(
+        data.y,
+        data.x,
+        weights=weights,
+        entity_effects=lsdv_config.entity_effects,
+        time_effects=lsdv_config.time_effects,
+        other_effects=other_effects,
+    )
     res1 = mod.fit()
     res2 = mod.fit(use_lsdv=True)
     res3 = mod.fit(use_lsmr=True)
@@ -1065,7 +1243,9 @@ def test_panel_effects_sanity(data):
         expected += res.estimated_effects.values
     assert_allclose(mod.dependent.values2d, expected)
 
-    mod = PanelOLS(data.y, data.x, weights=data.w, entity_effects=True, time_effects=True)
+    mod = PanelOLS(
+        data.y, data.x, weights=data.w, entity_effects=True, time_effects=True
+    )
     res = mod.fit(auto_df=False, count_effects=False)
     fitted = mod.exog.values2d @ res.params.values[:, None]
     expected = fitted
@@ -1075,32 +1255,32 @@
 
 def test_fitted_effects_residuals(data, entity_eff, time_eff):
-    mod = PanelOLS(data.y, data.x,
-                   entity_effects=entity_eff,
-                   time_effects=time_eff)
+    mod = PanelOLS(data.y, data.x, entity_effects=entity_eff, time_effects=time_eff)
     res = mod.fit()
 
     expected = mod.exog.values2d @ res.params.values
-    expected = pd.DataFrame(expected, index=mod.exog.index, columns=['fitted_values'])
+    expected = pd.DataFrame(expected, index=mod.exog.index, columns=["fitted_values"])
     assert_allclose(res.fitted_values, expected)
     assert_frame_similar(res.fitted_values, expected)
 
     expected.iloc[:, 0] = res.resids
-    expected.columns = ['idiosyncratic']
+    expected.columns = ["idiosyncratic"]
     assert_allclose(res.idiosyncratic, expected)
     assert_frame_similar(res.idiosyncratic, expected)
 
     fitted_error = res.fitted_values + res.idiosyncratic.values
     expected.iloc[:, 0] = mod.dependent.values2d - fitted_error
-    expected.columns = ['estimated_effects']
+    expected.columns = ["estimated_effects"]
     assert_allclose(res.estimated_effects, expected, atol=1e-8)
     assert_frame_similar(res.estimated_effects, expected)
 
 
-@pytest.mark.parametrize('weighted', [True, False])
+@pytest.mark.parametrize("weighted", [True, False])
 def test_low_memory(data, weighted):
     if weighted:
-        mod = PanelOLS(data.y, data.x, weights=data.w, entity_effects=True, time_effects=True)
+        mod = PanelOLS(
+            data.y, data.x, weights=data.w, entity_effects=True, time_effects=True
+        )
     else:
         mod = PanelOLS(data.y, data.x, entity_effects=True, time_effects=True)
     res = mod.fit()
@@ -1124,15 +1304,15 @@ def test_low_memory_auto():
        mod.fit()
 
 
-@pytest.mark.filterwarnings('ignore::linearmodels.utility.SingletonWarning')
+@pytest.mark.filterwarnings("ignore::linearmodels.utility.SingletonWarning")
 def test_singleton_removal():
     entities = []
     for i in range(6):
-        entities.extend(['entity.{j}'.format(j=j) for j in range(6 - i)])
+        entities.extend(["entity.{j}".format(j=j) for j in range(6 - i)])
     nobs = len(entities)
     times = np.arange(nobs) % 6
     index = pd.MultiIndex.from_arrays((entities, times))
-    cols = ['x{0}'.format(i) for i in range(3)]
+    cols = ["x{0}".format(i) for i in range(3)]
     x = pd.DataFrame(np.random.randn(nobs, 3), index=index, columns=cols)
     y = pd.DataFrame(np.random.randn(nobs, 1), index=index)
     mod = PanelOLS(y, x, singletons=False, entity_effects=True, time_effects=True)
@@ -1143,13 +1323,13 @@
     assert_allclose(res.params, res_with.params)
 
 
-@pytest.mark.filterwarnings('ignore::linearmodels.utility.SingletonWarning')
+@pytest.mark.filterwarnings("ignore::linearmodels.utility.SingletonWarning")
 def test_masked_singleton_removal():
     nobs = 8
-    entities = ['A', 'B', 'C', 'D'] * 2
+    entities = ["A", "B", "C", "D"] * 2
     times = [0, 1, 1, 1, 1, 2, 2, 2]
     index = pd.MultiIndex.from_arrays((entities, times))
-    x = pd.DataFrame(np.random.randn(nobs, 1), index=index, columns=['x'])
+    x = pd.DataFrame(np.random.randn(nobs, 1), index=index, columns=["x"])
     y = pd.DataFrame(np.random.randn(nobs, 1), index=index)
     mod = PanelOLS(y, x, singletons=False, entity_effects=True, time_effects=True)
     res = mod.fit()
@@ -1157,30 +1337,37 @@
 
 def test_singleton_removal_other_effects(data):
-    mod_keep = PanelOLS(data.y, data.x, weights=data.w, other_effects=data.c, singletons=True)
+    mod_keep = PanelOLS(
+        data.y, data.x, weights=data.w, other_effects=data.c, singletons=True
+    )
     res_keep = mod_keep.fit()
 
-    mod = PanelOLS(data.y, data.x, weights=data.w, other_effects=data.c, singletons=False)
-    res = mod.fit(cov_type='clustered', clusters=data.vc1)
+    mod = PanelOLS(
+        data.y, data.x, weights=data.w, other_effects=data.c, singletons=False
+    )
+    res = mod.fit(cov_type="clustered", clusters=data.vc1)
 
     assert res.nobs <= res_keep.nobs
 
 
 @pytest.mark.slow
-@pytest.mark.filterwarnings('ignore::linearmodels.utility.SingletonWarning')
-@pytest.mark.parametrize('other_effects', [1, 2])
+@pytest.mark.filterwarnings("ignore::linearmodels.utility.SingletonWarning")
+@pytest.mark.parametrize("other_effects", [1, 2])
 def test_singleton_removal_mixed(singleton_data, other_effects):
     if other_effects == 1:
         other_effects = PanelData(singleton_data.c).dataframe.iloc[:, [0]]
     elif other_effects == 2:
         other_effects = singleton_data.c
-    mod = PanelOLS(singleton_data.y, singleton_data.x,
-                   other_effects=other_effects)
+    mod = PanelOLS(singleton_data.y, singleton_data.x, other_effects=other_effects)
     res_keep = mod.fit(use_lsmr=True)
 
-    mod = PanelOLS(singleton_data.y, singleton_data.x,
-                   other_effects=other_effects, singletons=False)
-    res = mod.fit(cov_type='clustered', clusters=singleton_data.vc2, use_lsmr=True)
+    mod = PanelOLS(
+        singleton_data.y,
+        singleton_data.x,
+        other_effects=other_effects,
+        singletons=False,
+    )
+    res = mod.fit(cov_type="clustered", clusters=singleton_data.vc2, use_lsmr=True)
 
     assert_allclose(res_keep.params, res.params)
     assert res.nobs <= res_keep.nobs
@@ -1189,27 +1376,29 @@ def test_repeated_measures_weight():
     # Issue reported by email
     rs = np.random.RandomState(0)
     w = rs.chisquare(5, 300) / 5
-    idx1 = ['a']*100 + ['b']*100 + ['c']*100
+    idx1 = ["a"] * 100 + ["b"] * 100 + ["c"] * 100
     idx2 = np.arange(300) % 25
     mi = pd.MultiIndex.from_arrays([idx1, idx2])
-    df = pd.DataFrame(rs.standard_normal((300, 2)),
-                      index=mi, columns=['y', 'x'])
-    w = pd.Series(w, index=mi, name='weight')
-    df['weight'] = w
-    mod = PanelOLS.from_formula('y ~ x + EntityEffects + TimeEffects', df,
-                                weights=df['weight'])
+    df = pd.DataFrame(rs.standard_normal((300, 2)), index=mi, columns=["y", "x"])
+    w = pd.Series(w, index=mi, name="weight")
+    df["weight"] = w
+    mod = PanelOLS.from_formula(
+        "y ~ x + EntityEffects + TimeEffects", df, weights=df["weight"]
+    )
     res = mod.fit()
-    mod = PanelOLS.from_formula('y ~ x + EntityEffects + TimeEffects', df)
+    mod = PanelOLS.from_formula("y ~ x + EntityEffects + TimeEffects", df)
     res_un = mod.fit()
     assert res.params[0] != res_un.params[0]
 
 
 def test_absorbed(absorbed_data):
-    mod = PanelOLS(absorbed_data.y, absorbed_data.x, drop_absorbed=True, entity_effects=True)
+    mod = PanelOLS(
+        absorbed_data.y, absorbed_data.x, drop_absorbed=True, entity_effects=True
+    )
     if isinstance(absorbed_data.y, pd.DataFrame):
-        match = 'x_absorbed'
+        match = "x_absorbed"
     else:
-        match = 'Exog.3'
+        match = "Exog.3"
     with pytest.warns(AbsorbingEffectWarning, match=match):
         res = mod.fit()
     if isinstance(absorbed_data.x, np.ndarray):
diff --git a/linearmodels/tests/panel/test_pooled_ols.py b/linearmodels/tests/panel/test_pooled_ols.py
index e71d13401b..61f63d4a4a 100644
--- a/linearmodels/tests/panel/test_pooled_ols.py
+++ b/linearmodels/tests/panel/test_pooled_ols.py
@@ -13,12 +13,14 @@
                                                assert_results_equal, datatypes,
                                                generate_data)
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 missing = [0.0, 0.20]
 has_const = [True, False]
 perms = list(product(missing, datatypes, has_const))
-ids = list(map(lambda s: '-'.join(map(str, s)), perms))
+ids = list(map(lambda s: "-".join(map(str, s)), perms))
 
 
 @pytest.fixture(params=perms, ids=ids)
@@ -36,10 +38,10 @@ def test_pooled_ols(data):
     y.index = np.arange(len(y))
     x.index = y.index
 
-    res2 = IV2SLS(y, x, None, None).fit(cov_type='unadjusted')
+    res2 = IV2SLS(y, x, None, None).fit(cov_type="unadjusted")
     assert_results_equal(res, res2)
 
-    res3 = mod.fit(cov_type='homoskedastic', debiased=False)
+    res3 = mod.fit(cov_type="homoskedastic", debiased=False)
     assert_results_equal(res, res3)
@@ -53,14 +55,14 @@ def test_pooled_ols_weighted(data):
     y.index = np.arange(len(y))
     w.index = x.index = y.index
 
-    res2 = IV2SLS(y, x, None, None, weights=w).fit(cov_type='unadjusted')
+    res2 = IV2SLS(y, x, None, None, weights=w).fit(cov_type="unadjusted")
     assert_results_equal(res, res2)
 
 
 def test_diff_data_size(data):
     if isinstance(data.x, pd.DataFrame):
         entities = data.x.index.levels[0]
-        x = data.x.loc[pd.IndexSlice[entities[0]:entities[-2]]]
+        x = data.x.loc[pd.IndexSlice[entities[0] : entities[-2]]]
         y = data.y
     elif isinstance(data.x, np.ndarray):
         x = data.x
@@ -115,62 +117,63 @@ def test_alt_rsquared_weighted(data):
 
 def test_cov_equiv(data):
     mod = PooledOLS(data.y, data.x)
-    res = mod.fit(cov_type='robust', debiased=False)
+    res = mod.fit(cov_type="robust", debiased=False)
 
     y = mod.dependent.dataframe.copy()
     x = mod.exog.dataframe.copy()
     y.index = np.arange(len(y))
     x.index = y.index
 
-    res2 = IV2SLS(y, x, None, None).fit(cov_type='robust')
+    res2 = IV2SLS(y, x, None, None).fit(cov_type="robust")
     assert_results_equal(res, res2)
 
-    res3 = mod.fit(cov_type='heteroskedastic', debiased=False)
+    res3 = mod.fit(cov_type="heteroskedastic", debiased=False)
     assert_results_equal(res, res3)
 
 
 def test_cov_equiv_weighted(data):
     mod = PooledOLS(data.y, data.x, weights=data.w)
-    res = mod.fit(cov_type='robust', debiased=False)
+    res = mod.fit(cov_type="robust", debiased=False)
 
     y = mod.dependent.dataframe.copy()
     x = mod.exog.dataframe.copy()
     w = mod.weights.dataframe.copy()
     y.index = np.arange(len(y))
     w.index = x.index = y.index
 
-    res2 = IV2SLS(y, x, None, None, weights=w).fit(cov_type='robust')
+    res2 = IV2SLS(y, x, None, None, weights=w).fit(cov_type="robust")
     assert_results_equal(res, res2)
 
-    res3 = mod.fit(cov_type='heteroskedastic', debiased=False)
+    res3 = mod.fit(cov_type="heteroskedastic", debiased=False)
     assert_results_equal(res, res3)
 
 
 def test_cov_equiv_cluster(data):
     mod = PooledOLS(data.y, data.x)
-    res = mod.fit(cov_type='clustered', cluster_entity=True, debiased=False)
+    res = mod.fit(cov_type="clustered", cluster_entity=True, debiased=False)
     y = PanelData(data.y)
     clusters = pd.DataFrame(y.entity_ids, index=y.index)
-    res2 = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res2 = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
     assert_results_equal(res, res2)
 
-    res = mod.fit(cov_type='clustered', cluster_time=True, debiased=False)
+    res = mod.fit(cov_type="clustered", cluster_time=True, debiased=False)
     clusters = pd.DataFrame(y.time_ids, index=y.index)
-    res2 = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res2 = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
     assert_results_equal(res, res2)
 
-    res = mod.fit(cov_type='clustered', clusters=data.vc1, debiased=False)
+    res = mod.fit(cov_type="clustered", clusters=data.vc1, debiased=False)
     y = mod.dependent.dataframe.copy()
     x = mod.exog.dataframe.copy()
     y.index = np.arange(len(y))
     x.index = y.index
     clusters = mod.reformat_clusters(data.vc1)
     ols_mod = IV2SLS(y, x, None, None)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters.dataframe,
-                       debiased=False)
+    res2 = ols_mod.fit(
+        cov_type="clustered", clusters=clusters.dataframe, debiased=False
+    )
     assert_results_equal(res, res2)
 
 
 def test_cov_equiv_cluster_weighted(data):
     mod = PooledOLS(data.y, data.x, weights=data.w)
-    res = mod.fit(cov_type='clustered', clusters=data.vc1, debiased=False)
+    res = mod.fit(cov_type="clustered", clusters=data.vc1, debiased=False)
 
     y = mod.dependent.dataframe.copy()
     x = mod.exog.dataframe.copy()
@@ -179,7 +182,7 @@ def test_cov_equiv_cluster_weighted(data):
     w.index = x.index = y.index
     clusters = mod.reformat_clusters(data.vc1)
     ols_mod = IV2SLS(y, x, None, None, weights=w)
-    res2 = ols_mod.fit(cov_type='clustered', clusters=clusters.dataframe)
+    res2 = ols_mod.fit(cov_type="clustered", clusters=clusters.dataframe)
     assert_results_equal(res, res2)
@@ -190,9 +193,9 @@ def test_two_way_clustering(data):
     entity_clusters = pd.DataFrame(y.entity_ids, index=y.index)
     vc1 = PanelData(data.vc1)
     clusters = vc1.copy()
-    clusters.dataframe['var.cluster.entity'] = entity_clusters
+    clusters.dataframe["var.cluster.entity"] = entity_clusters
     clusters._frame = clusters._frame.astype(np.int64)
-    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
+    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
 
     y = mod.dependent.dataframe.copy()
     x = mod.exog.dataframe.copy()
@@ -201,7 +204,7 @@
     clusters = mod.reformat_clusters(clusters)
 
     ols_mod = IV2SLS(y, x, None, None)
-    ols_res = ols_mod.fit(cov_type='clustered', clusters=clusters.dataframe)
+    ols_res = ols_mod.fit(cov_type="clustered", clusters=clusters.dataframe)
     assert_results_equal(res, ols_res)
@@ -209,16 +212,16 @@ def test_fitted_effects_residuals(data):
     mod = PooledOLS(data.y, data.x)
     res = mod.fit()
     expected = pd.DataFrame(res.resids.copy())
-    expected.columns = ['idiosyncratic']
+    expected.columns = ["idiosyncratic"]
     assert_allclose(res.idiosyncratic, expected)
     assert_frame_similar(res.idiosyncratic, expected)
 
     expected = mod.dependent.values2d - res.resids.values[:, None]
-    expected = pd.DataFrame(expected, index=res.resids.index, columns=['fitted_values'])
+    expected = pd.DataFrame(expected, index=res.resids.index, columns=["fitted_values"])
     assert_allclose(res.fitted_values, expected)
     assert_frame_similar(res.fitted_values, expected)
 
     expected.iloc[:, 0] = np.nan
-    expected.columns = ['estimated_effects']
+    expected.columns = ["estimated_effects"]
     assert_allclose(res.estimated_effects, expected)
     assert_frame_similar(res.estimated_effects, expected)
diff --git a/linearmodels/tests/panel/test_random_effects.py b/linearmodels/tests/panel/test_random_effects.py
index bd8160c1ab..7d6d6e6238 100644
--- a/linearmodels/tests/panel/test_random_effects.py
+++ b/linearmodels/tests/panel/test_random_effects.py
@@ -10,12 +10,14 @@
                                                assert_frame_similar, datatypes,
                                                generate_data)
 
-pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 missing = [0.0, 0.20]
 has_const = [True, False]
 perms = list(product(missing, datatypes, has_const))
-ids = list(map(lambda s: '-'.join(map(str, s)), perms))
+ids = list(map(lambda s: "-".join(map(str, s)), perms))
 
 
 @pytest.fixture(params=perms, ids=ids)
@@ -30,17 +32,17 @@ def test_random_effects_small_sample(data):
     no_ss = mod.fit()
     ss = mod.fit(small_sample=True)
     if y.dataframe.shape[0] == mod.dependent.dataframe.shape[0]:
-        assert (ss.variance_decomposition.Effects == no_ss.variance_decomposition.Effects)
+        assert ss.variance_decomposition.Effects == no_ss.variance_decomposition.Effects
     else:
-        assert (ss.variance_decomposition.Effects != no_ss.variance_decomposition.Effects)
+        assert ss.variance_decomposition.Effects != no_ss.variance_decomposition.Effects
 
     mod = RandomEffects(data.y, data.x, weights=data.w)
     no_ss = mod.fit()
     ss = mod.fit(small_sample=True)
     if y.dataframe.shape[0] == mod.dependent.dataframe.shape[0]:
-        assert (ss.variance_decomposition.Effects == no_ss.variance_decomposition.Effects)
+        assert ss.variance_decomposition.Effects == no_ss.variance_decomposition.Effects
     else:
-        assert (ss.variance_decomposition.Effects != no_ss.variance_decomposition.Effects)
+        assert ss.variance_decomposition.Effects != no_ss.variance_decomposition.Effects
 
 
 def test_results_access(data):
@@ -54,17 +56,17 @@ def test_fitted_effects_residuals(data):
     res = mod.fit()
 
     expected = mod.exog.values2d @ res.params.values
-    expected = pd.DataFrame(expected, index=mod.exog.index, columns=['fitted_values'])
+    expected = pd.DataFrame(expected, index=mod.exog.index, columns=["fitted_values"])
     assert_allclose(res.fitted_values, expected)
     assert_frame_similar(res.fitted_values, expected)
 
     expected.iloc[:, 0] = res.resids
-    expected.columns = ['idiosyncratic']
+    expected.columns = ["idiosyncratic"]
     assert_allclose(res.idiosyncratic, expected)
     assert_frame_similar(res.idiosyncratic, expected)
 
     fitted_error = res.fitted_values + res.idiosyncratic.values
     expected.iloc[:, 0] = mod.dependent.values2d - fitted_error
-    expected.columns = ['estimated_effects']
+    expected.columns = ["estimated_effects"]
     assert_allclose(res.estimated_effects, expected)
     assert_frame_similar(res.estimated_effects, expected)
diff --git a/linearmodels/tests/panel/test_results.py b/linearmodels/tests/panel/test_results.py
index 71663bc71b..04cabb7a2e 100644
--- a/linearmodels/tests/panel/test_results.py
+++ b/linearmodels/tests/panel/test_results.py
@@ -23,72 +23,75 @@ def data(request):
 missing = [0.0, 0.02, 0.20]
 has_const = [True, False]
 perms = list(product(missing, datatypes, has_const))
-ids = list(map(lambda s: '-'.join(map(str, s)), perms))
+ids = list(map(lambda s: "-".join(map(str, s)), perms))
 
 
 @pytest.fixture(params=perms, ids=ids)
 def generated_data(request):
     missing, datatype, const = request.param
-    return generate_data(missing, datatype, const=const, ntk=(91, 7, 5), other_effects=2)
+    return generate_data(
+        missing, datatype, const=const, ntk=(91, 7, 5), other_effects=2
+    )
 
 
-@pytest.mark.parametrize('precision', ('tstats', 'std_errors', 'pvalues'))
+@pytest.mark.parametrize("precision", ("tstats", "std_errors", "pvalues"))
 def test_single(data, precision):
-    dependent = data.set_index(['nr', 'year']).lwage
-    exog = add_constant(data.set_index(['nr', 'year'])[['expersq', 'married', 'union']])
+    dependent = data.set_index(["nr", "year"]).lwage
+    exog = add_constant(data.set_index(["nr", "year"])[["expersq", "married", "union"]])
     res = PanelOLS(dependent, exog, entity_effects=True).fit()
     comp = compare([res])
     assert len(comp.rsquared) == 1
     d = dir(comp)
     for value in d:
-        if value.startswith('_'):
+        if value.startswith("_"):
             continue
         getattr(comp, value)
 
 
-@pytest.mark.parametrize('precision', ('tstats', 'std_errors', 'pvalues'))
+@pytest.mark.parametrize("precision", ("tstats", "std_errors", "pvalues"))
 def test_multiple(data, precision):
-    dependent = data.set_index(['nr', 'year']).lwage
-    exog = add_constant(data.set_index(['nr', 'year'])[['expersq', 'married', 'union']])
+    dependent = data.set_index(["nr", "year"]).lwage
+    exog = add_constant(data.set_index(["nr", "year"])[["expersq", "married", "union"]])
     res = PanelOLS(dependent, exog, entity_effects=True, time_effects=True).fit()
-    res2 = PanelOLS(dependent, exog, entity_effects=True).fit(cov_type='clustered',
-                                                              cluster_entity=True)
-    exog = add_constant(data.set_index(['nr', 'year'])[['married', 'union']])
+    res2 = PanelOLS(dependent, exog, entity_effects=True).fit(
+        cov_type="clustered", cluster_entity=True
+    )
+    exog = add_constant(data.set_index(["nr", "year"])[["married", "union"]])
     res3 = PooledOLS(dependent, exog).fit()
-    exog = data.set_index(['nr', 'year'])[['exper']]
+    exog = data.set_index(["nr", "year"])[["exper"]]
     res4 = RandomEffects(dependent, exog).fit()
     comp = compare([res, res2, res3, res4], precision=precision)
     assert len(comp.rsquared) == 4
     d = dir(comp)
     for value in d:
-        if value.startswith('_'):
+        if value.startswith("_"):
             continue
         getattr(comp, value)
     with pytest.raises(ValueError):
-        compare([res, res2, res3, res4], precision='unknown')
+        compare([res, res2, res3, res4], precision="unknown")
 
 
 def test_multiple_no_effects(data):
-    dependent = data.set_index(['nr', 'year']).lwage
-    exog = add_constant(data.set_index(['nr', 'year'])[['expersq', 'married', 'union']])
+    dependent = data.set_index(["nr", "year"]).lwage
+    exog = add_constant(data.set_index(["nr", "year"])[["expersq", "married", "union"]])
     res = PanelOLS(dependent, exog).fit()
-    exog = add_constant(data.set_index(['nr', 'year'])[['married', 'union']])
+    exog = add_constant(data.set_index(["nr", "year"])[["married", "union"]])
     res3 = PooledOLS(dependent, exog).fit()
-    exog = data.set_index(['nr', 'year'])[['exper']]
+    exog = data.set_index(["nr", "year"])[["exper"]]
     res4 = RandomEffects(dependent, exog).fit()
     comp = compare(dict(a=res, model2=res3, model3=res4))
     assert len(comp.rsquared) == 3
     d = dir(comp)
     for value in d:
-        if value.startswith('_'):
+        if value.startswith("_"):
             continue
         getattr(comp, value)
 
     compare(OrderedDict(a=res, model2=res3, model3=res4))
 
 
 def test_incorrect_type(data):
-    dependent = data.set_index(['nr', 'year']).lwage
-    exog = add_constant(data.set_index(['nr', 'year'])[['expersq', 'married', 'union']])
+    dependent = data.set_index(["nr", "year"]).lwage
+    exog = add_constant(data.set_index(["nr", "year"])[["expersq", "married", "union"]])
     mod = PanelOLS(dependent, exog)
     res = mod.fit()
     mod2 = IV2SLS(mod.dependent.dataframe, mod.exog.dataframe, None, None)
@@ -97,41 +100,41 @@ def test_incorrect_type(data):
         compare(dict(model1=res, model2=res2))
 
 
-@pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+@pytest.mark.filterwarnings("ignore::linearmodels.utility.MissingValueWarning")
 def test_predict(generated_data):
     mod = PanelOLS(generated_data.y, generated_data.x, entity_effects=True)
     res = mod.fit()
     pred = res.predict()
     nobs = mod.dependent.dataframe.shape[0]
-    assert list(pred.columns) == ['fitted_values']
+    assert list(pred.columns) == ["fitted_values"]
     assert pred.shape == (nobs, 1)
 
     pred = res.predict(effects=True, idiosyncratic=True)
-    assert list(pred.columns) == ['fitted_values', 'estimated_effects', 'idiosyncratic']
+    assert list(pred.columns) == ["fitted_values", "estimated_effects", "idiosyncratic"]
     assert pred.shape == (nobs, 3)
     assert_series_equal(pred.fitted_values, res.fitted_values.iloc[:, 0])
     assert_series_equal(pred.estimated_effects, res.estimated_effects.iloc[:, 0])
     assert_series_equal(pred.idiosyncratic, res.idiosyncratic.iloc[:, 0])
 
     pred = res.predict(effects=True, idiosyncratic=True, missing=True)
-    assert list(pred.columns) == ['fitted_values', 'estimated_effects', 'idiosyncratic']
+    assert list(pred.columns) == ["fitted_values", "estimated_effects", "idiosyncratic"]
     assert pred.shape == (PanelData(generated_data.y).dataframe.shape[0], 3)
 
     mod = PanelOLS(generated_data.y, generated_data.x)
     res = mod.fit()
     pred = res.predict()
-    assert list(pred.columns) == ['fitted_values']
+    assert list(pred.columns) == ["fitted_values"]
     assert pred.shape == (nobs, 1)
 
     pred = res.predict(effects=True, idiosyncratic=True)
-    assert list(pred.columns) == ['fitted_values', 'estimated_effects', 'idiosyncratic']
+    assert list(pred.columns) == ["fitted_values", "estimated_effects", "idiosyncratic"]
     assert pred.shape == (nobs, 3)
     assert_series_equal(pred.fitted_values, res.fitted_values.iloc[:, 0])
     assert_series_equal(pred.estimated_effects, res.estimated_effects.iloc[:, 0])
     assert_series_equal(pred.idiosyncratic, res.idiosyncratic.iloc[:, 0])
 
     pred = res.predict(effects=True, idiosyncratic=True, missing=True)
-    assert list(pred.columns) == ['fitted_values', 'estimated_effects', 'idiosyncratic']
+    assert list(pred.columns) == ["fitted_values", "estimated_effects", "idiosyncratic"]
     assert pred.shape == (PanelData(generated_data.y).dataframe.shape[0], 3)
 
 
-@pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning')
+@pytest.mark.filterwarnings("ignore::linearmodels.utility.MissingValueWarning")
 def test_predict_no_selection(generated_data):
     mod = PanelOLS(generated_data.y, generated_data.x, entity_effects=True)
     res = mod.fit()
@@ -142,8 +145,8 @@ def test_predict_no_selection(generated_data):
 
 
 def test_wald_test(data):
-    dependent = data.set_index(['nr', 'year']).lwage
-    exog = add_constant(data.set_index(['nr', 'year'])[['expersq', 'married', 'union']])
+    dependent = data.set_index(["nr", "year"]).lwage
+    exog = add_constant(data.set_index(["nr", "year"])[["expersq", "married", "union"]])
     res = PanelOLS(dependent, exog, entity_effects=True, time_effects=True).fit()
 
     restriction = np.zeros((2, 4))
@@ -151,7 +154,7 @@ def test_wald_test(data):
     restriction[1, 3] = 1
     t1 = res.wald_test(restriction)
     t2 = res.wald_test(restriction, np.zeros(2))
-    formula = 'married = union = 0'
+    formula = "married = union = 0"
     t3 = res.wald_test(formula=formula)
     p = res.params.values[:, None]
     c = np.asarray(res.cov)
diff --git a/linearmodels/tests/panel/test_simulated_against_stata.py b/linearmodels/tests/panel/test_simulated_against_stata.py
index 7d54701e57..10d87d6db8 100644
--- a/linearmodels/tests/panel/test_simulated_against_stata.py
+++ b/linearmodels/tests/panel/test_simulated_against_stata.py
@@ -12,23 +12,28 @@
 from linearmodels.utility import AttrDict
 
 pytestmark = pytest.mark.filterwarnings(
-    'ignore::linearmodels.utility.MissingValueWarning')
+    "ignore::linearmodels.utility.MissingValueWarning"
+)
 
 STATA_RESULTS = parse_stata_results.data()
-MODELS = {'between': BetweenOLS, 'fixed_effect': PanelOLS, 'pooled': PooledOLS,
-          'random_effect': RandomEffects}
+MODELS = {
+    "between": BetweenOLS,
+    "fixed_effect": PanelOLS,
+    "pooled": PooledOLS,
+    "random_effect": RandomEffects,
+}
 cwd = os.path.split(os.path.abspath(__file__))[0]
-sim_data = pd.read_stata(os.path.join(cwd, 'results', 'simulated-panel.dta'))
-sim_data = sim_data.set_index(['firm', 'time'])
+sim_data = pd.read_stata(os.path.join(cwd, "results", "simulated-panel.dta"))
+sim_data = sim_data.set_index(["firm", "time"])
 
 valid = sorted(list(filter(lambda x: True, list(STATA_RESULTS.keys()))))
 
 
-@pytest.fixture(params=valid, scope='module')
+@pytest.fixture(params=valid, scope="module")
 def data(request):
-    model, vcv, weights, missing = request.param.split('-')
-    y_vars = ['y']
-    x_vars = ['x1', 'x2', 'x3', 'x4', 'x5']
+    model, vcv, weights, missing = request.param.split("-")
+    y_vars = ["y"]
+    x_vars = ["x1", "x2", "x3", "x4", "x5"]
     vars = y_vars + x_vars
     if missing:
         for i, v in enumerate(vars):
@@ -36,45 +41,53 @@ def data(request):
     y_vars = vars[:1]
     x_vars = vars[1:]
     y = sim_data[y_vars]
-    x = sim_data[['intercept'] + x_vars]
+    x = sim_data[["intercept"] + x_vars]
     mod = MODELS[model]
     mod_options = {}
-    if model == 'fixed_effect':
-        mod_options = {'entity_effects': True}
-    if weights == 'weighted':
-        mod_options.update({'weights': sim_data['w']})
-    fit_options = {'debiased': True}
-    if weights == 'wls':
-        fit_options.update({'reweight': True})
-    if vcv == 'robust' and model not in ('fixed_effect', 'random_effect'):
-        fit_options.update({'cov_type': 'robust'})
-    elif vcv in ('cluster', 'robust'):
+    if model == "fixed_effect":
+        mod_options = {"entity_effects": True}
+    if weights == "weighted":
+        mod_options.update({"weights": sim_data["w"]})
+    fit_options = {"debiased": True}
+    if weights == "wls":
+        fit_options.update({"reweight": True})
+    if vcv == "robust" and model not in ("fixed_effect", "random_effect"):
+        fit_options.update({"cov_type": "robust"})
+    elif vcv in ("cluster", "robust"):
         y_data = PanelData(y)
         eid = y_data.entity_ids
-        entities = pd.DataFrame(eid, index=y_data.index, columns=['firm_ids'])
-        fit_options.update({'cov_type': 'clustered', 'clusters': entities})
+        entities = pd.DataFrame(eid, index=y_data.index, columns=["firm_ids"])
+        fit_options.update({"cov_type": "clustered", "clusters": entities})
     else:
-        fit_options.update({'cov_type': 'unadjusted'})
+        fit_options.update({"cov_type": "unadjusted"})
 
-    if vcv == 'cluster' or (
-            model in ('fixed_effect', 'random_effect') and vcv == 'robust'):
-        fit_options.update({'group_debias': True})
+    if vcv == "cluster" or (
+        model in ("fixed_effect", "random_effect") and vcv == "robust"
+    ):
+        fit_options.update({"group_debias": True})
     spec_mod = mod(y, x, **mod_options)
     fit = spec_mod.fit(**fit_options)
-    return AttrDict(fit=fit, model=spec_mod, model_options=mod_options, y=y,
-                    x=x,
-                    stata=STATA_RESULTS[request.param],
-                    fit_options=fit_options,
-                    model_name=model, vcv=vcv, weights=weights,
-                    missing=missing)
+    return AttrDict(
+        fit=fit,
+        model=spec_mod,
+        model_options=mod_options,
+        y=y,
+        x=x,
+        stata=STATA_RESULTS[request.param],
+        fit_options=fit_options,
+        model_name=model,
+        vcv=vcv,
+        weights=weights,
+        missing=missing,
+    )
 
 
 # TODO: pvals, r2o, r2
 def correct_order(stata, lm):
     repl = []
     for c in stata.index:
-        if c == '_cons':
-            repl.append('intercept')
+        if c == "_cons":
+            repl.append("intercept")
         else:
             repl.append(c)
     stata = stata.copy()
@@ -92,9 +105,8 @@ def test_params(data):
 
 
 def test_rsquared_between(data):
-    if (data.weights in ('weighted', 'wls') or
-            data.missing in ('_heavy', '_light')):
-        pytest.xfail(reason='Respect weights in calculation')
+    if data.weights in ("weighted", "wls") or data.missing in ("_heavy", "_light"):
+        pytest.xfail(reason="Respect weights in calculation")
     r2_between = data.fit.rsquared_between
     if np.isnan(data.stata.r2_b):
         return
@@ -102,8 +114,8 @@ def test_rsquared_between(data):
 
 
 def test_rsquared_within(data):
-    if data.model_name == 'between':
-        pytest.xfail(reason='Use stricter definition of rsquared within')
+    if data.model_name == "between":
+        pytest.xfail(reason="Use stricter definition of rsquared within")
     r2_within = data.fit.rsquared_within
     if np.isnan(data.stata.r2_w):
         return
@@ -115,8 +127,8 @@ def test_cov(data):
     stata = data.stata
     repl = []
     for c in stata.variance.columns:
-        if c == '_cons':
-            repl.append('intercept')
+        if c == "_cons":
+            repl.append("intercept")
         else:
             repl.append(c)
     var = stata.variance.copy()
@@ -127,10 +139,10 @@ def test_cov(data):
 
 
 def test_f_pooled(data):
-    f_pool = getattr(data.fit, 'f_pooled', None)
+    f_pool = getattr(data.fit, "f_pooled", None)
     stata_f_pool = data.stata.F_f
     if not f_pool or np.isnan(stata_f_pool):
-        pytest.skip('Result not available for testing')
+        pytest.skip("Result not available for testing")
     assert_allclose(f_pool.stat, stata_f_pool)
@@ -146,8 +158,8 @@ def test_t_stat(data):
     stata_t = data.stata.params.tstat
     repl = []
     for c in stata_t.index:
-        if c == '_cons':
-            repl.append('intercept')
+        if c == "_cons":
+            repl.append("intercept")
         else:
             repl.append(c)
     stata_t.index = repl
diff --git a/linearmodels/tests/panel/test_utility.py b/linearmodels/tests/panel/test_utility.py
index 9b0d3770f8..15b84561c4 100644
--- a/linearmodels/tests/panel/test_utility.py
+++ b/linearmodels/tests/panel/test_utility.py
@@ -9,13 +9,19 @@
 from linearmodels.panel.utility import (dummy_matrix, in_2core_graph,
                                         in_2core_graph_slow, preconditioner)
 
-formats = {'csc': scipy.sparse.csc.csc_matrix, 'csr': scipy.sparse.csr.csr_matrix,
-           'coo': scipy.sparse.coo.coo_matrix, 'array': np.ndarray}
+formats = {
+    "csc": scipy.sparse.csc.csc_matrix,
+    "csr": scipy.sparse.csr.csr_matrix,
+    "coo": scipy.sparse.coo.coo_matrix,
+    "array": np.ndarray,
+}
 
-pytestmark = pytest.mark.filterwarnings('ignore:the matrix subclass:PendingDeprecationWarning')
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:the matrix subclass:PendingDeprecationWarning"
+)
 
 
-@pytest.fixture(scope='module', params=formats)
+@pytest.fixture(scope="module", params=formats)
 def format(request):
     return request.param, formats[request.param]
@@ -40,7 +46,7 @@ def test_dummy_last():
     cats[10:, 0] = 2
     cats[:, 1] = np.arange(15) % 5
     cats[-1, 1] = 0
-    out, _ = dummy_matrix(cats, drop='last', precondition=False)
+    out, _ = dummy_matrix(cats, drop="last", precondition=False)
     assert isinstance(out, scipy.sparse.csc.csc_matrix)
     assert out.shape == (15, 3 + 5 - 1)
     expected = np.array([5, 5, 5, 4, 3, 3, 3], dtype=np.int32)
@@ -52,14 +58,14 @@ def test_invalid_format():
     cats = np.zeros([10, 1], dtype=np.int8)
     cats[5:, 0] = 1
     with pytest.raises(ValueError):
-        dummy_matrix(cats, format='unknown', precondition=False)
+        dummy_matrix(cats, format="unknown", precondition=False)
 
 
 def test_dummy_pandas():
-    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
-    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
+    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
+    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
     cats = pd.concat([c1, c2], 1)
-    out, _ = dummy_matrix(cats, drop='last', precondition=False)
+    out, _ = dummy_matrix(cats, drop="last", precondition=False)
     assert isinstance(out, scipy.sparse.csc.csc_matrix)
     assert out.shape == (15, 3 + 5 - 1)
     expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
@@ -67,12 +73,14 @@ def test_dummy_pandas():
 
 
 def test_dummy_precondition():
-    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
-    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
+    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
+    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
     cats = pd.concat([c1, c2], 1)
-    out_arr, cond_arr = dummy_matrix(cats, format='array', drop='last', precondition=True)
-    out_csc, cond_csc = dummy_matrix(cats, format='csc', drop='last', precondition=True)
-    out_csr, cond_csr = dummy_matrix(cats, format='csr', drop='last', precondition=True)
+    out_arr, cond_arr = dummy_matrix(
+        cats, format="array", drop="last", precondition=True
+    )
+    out_csc, cond_csc = dummy_matrix(cats, format="csc", drop="last", precondition=True)
+    out_csr, cond_csr = dummy_matrix(cats, format="csr", drop="last", precondition=True)
     assert_allclose((out_arr ** 2).sum(0), np.ones(out_arr.shape[1]))
     assert_allclose((out_csc.multiply(out_csc)).sum(0).A1, np.ones(out_arr.shape[1]))
     assert_allclose(cond_arr, cond_csc)
@@ -108,23 +116,23 @@ def test_drop_singletons_slow():
 
     idx = np.arange(40000)
-    cols = {'c1': c1.copy(), 'c2': c2.copy()}
+    cols = {"c1": c1.copy(), "c2": c2.copy()}
     for i in range(40000):
-        last = cols['c1'].shape[0]
+        last = cols["c1"].shape[0]
         for col in cols:
             keep = in_2core_graph_slow(cols[col])
             for col2 in cols:
                 cols[col2] = cols[col2][keep]
             idx = idx[keep]
-        if cols['c1'].shape[0] == last:
+        if cols["c1"].shape[0] == last:
             break
 
     expected = np.concatenate([c1[idx], c2[idx]], 1)
     assert_array_equal(nonsingletons, expected)
-    expected = np.concatenate([cols['c1'], cols['c2']], 1)
+    expected = np.concatenate([cols["c1"], cols["c2"]], 1)
     assert_array_equal(nonsingletons, expected)
 
-    dummies, _ = dummy_matrix(cats, format='csr', precondition=False)
+    dummies, _ = dummy_matrix(cats, format="csr", precondition=False)
     to_drop = dummies[~retain]
     assert to_drop.sum() == 2 * (~retain).sum()
@@ -154,10 +162,12 @@ def test_drop_singletons_pandas():
     rs = np.random.RandomState(0)
     c1 = rs.randint(0, 10000, (40000, 1))
     c2 = rs.randint(0, 20000, (40000, 1))
-    df = [pd.Series(['{0}{1}'.format(let, c) for c in cat.ravel()], dtype='category')
-          for let, cat in zip('AB', (c1, c2))]
+    df = [
+        pd.Series(["{0}{1}".format(let, c) for c in cat.ravel()], dtype="category")
+        for let, cat in zip("AB", (c1, c2))
+    ]
     df = pd.concat(df, 1)
-    df.columns = ['cat1', 'cat2']
+    df.columns = ["cat1", "cat2"]
     cats = df
     remain = in_2core_graph(cats)
     expected = in_2core_graph_slow(cats)
diff --git a/linearmodels/tests/system/_utility.py b/linearmodels/tests/system/_utility.py
index 1d4e17e1a1..90d2190068 100644
--- a/linearmodels/tests/system/_utility.py
+++ b/linearmodels/tests/system/_utility.py
@@ -6,8 +6,17 @@
 from linearmodels.utility import AttrDict
 
 
-def generate_data(n=500, k=10, p=3, const=True, rho=0.8, common_exog=False,
-                  included_weights=False, output_dict=True, seed=1234):
+def generate_data(
+    n=500,
+    k=10,
+    p=3,
+    const=True,
+    rho=0.8,
+    common_exog=False,
+    included_weights=False,
+    output_dict=True,
+    seed=1234,
+):
     np.random.seed(seed)
     p = np.array(p)
     if p.ndim == 0:
@@ -34,13 +43,13 @@ def generate_data(n=500, k=10, p=3, const=True, rho=0.8, common_exog=False,
         if included_weights:
             w = np.random.chisquare(5, (n, 1)) / 5
         if output_dict:
-            data['equ.{0}'.format(i)] = {'dependent': y, 'exog': x}
+            data["equ.{0}".format(i)] = {"dependent": y, "exog": x}
             if included_weights:
-                data['equ.{0}'.format(i)]['weights'] = w
+                data["equ.{0}".format(i)]["weights"] = w
         else:
-            data['equ.{0}'.format(i)] = (y, x)
+            data["equ.{0}".format(i)] = (y, x)
             if included_weights:
-                data['equ.{0}'.format(i)] = tuple(list(data['equ.{0}'.format(i)]) + [w])
+                data["equ.{0}".format(i)] = tuple(list(data["equ.{0}".format(i)]) + [w])
 
     return data
 
@@ -52,9 +61,21 @@ def atleast_k_elem(x, k):
     return x
 
 
-def generate_3sls_data(n=500, k=10, p=3, en=2, instr=3, const=True, rho=0.8, kappa=0.5,
-                       beta=0.5, common_exog=False, included_weights=False, output_dict=True,
-                       seed=1234):
+def generate_3sls_data(
+    n=500,
+    k=10,
+    p=3,
+    en=2,
+    instr=3,
+    const=True,
+    rho=0.8,
+    kappa=0.5,
+    beta=0.5,
+    common_exog=False,
+    included_weights=False,
+    output_dict=True,
+    seed=1234,
+):
     np.random.seed(seed)
     p = 
atleast_k_elem(p, k) en = atleast_k_elem(en, k) @@ -79,25 +100,27 @@ def generate_3sls_data(n=500, k=10, p=3, en=2, instr=3, const=True, rho=0.8, kap for i, _p, _en, _instr in zip(range(k), p, en, instr): total = _p + _en + _instr corr = np.eye(_p + _en + _instr + 1) - corr[_p:_p + _en, _p:_p + _en] = kappa * np.eye(_en) - corr[_p:_p + _en, -1] = np.sqrt(1 - kappa ** 2) * np.ones(_en) - corr[_p + _en:_p + _en + _instr, _p:_p + _en] = beta * np.ones((_instr, _en)) + corr[_p : _p + _en, _p : _p + _en] = kappa * np.eye(_en) + corr[_p : _p + _en, -1] = np.sqrt(1 - kappa ** 2) * np.ones(_en) + corr[_p + _en : _p + _en + _instr, _p : _p + _en] = beta * np.ones( + (_instr, _en) + ) if _instr > 0: val = np.sqrt(1 - beta ** 2) / _instr * np.eye(_instr) - corr[_p + _en:_p + _en + _instr, _p + _en:_p + _en + _instr] = val + corr[_p + _en : _p + _en + _instr, _p + _en : _p + _en + _instr] = val if common_exog: shocks = np.random.standard_normal((n, total)) common_shocks = common_shocks if common_shocks is not None else shocks else: shocks = np.random.standard_normal((n, total)) - shocks = np.concatenate([shocks, eps[:, count:count + 1]], 1) + shocks = np.concatenate([shocks, eps[:, count : count + 1]], 1) variables = shocks @ corr.T - x = variables[:, :_p + _en] + x = variables[:, : _p + _en] exog = variables[:, :_p] - endog = variables[:, _p:_p + _en] - instr = variables[:, _p + _en:total] - e = variables[:, total:total + 1] + endog = variables[:, _p : _p + _en] + instr = variables[:, _p + _en : total] + e = variables[:, total : total + 1] if const: x = np.c_[np.ones((n, 1)), x] exog = np.c_[np.ones((n, 1)), exog] @@ -111,15 +134,19 @@ def generate_3sls_data(n=500, k=10, p=3, en=2, instr=3, const=True, rho=0.8, kap if _instr == 0: instr = None if output_dict: - data['equ.{0}'.format(count)] = {'dependent': dep, 'exog': exog, - 'endog': endog, 'instruments': instr} + data["equ.{0}".format(count)] = { + "dependent": dep, + "exog": exog, + "endog": endog, + "instruments": instr, + } if included_weights: - data['equ.{0}'.format(count)]['weights'] = w + data["equ.{0}".format(count)]["weights"] = w else: if included_weights: - data['equ.{0}'.format(count)] = (dep, exog, endog, instr, w) + data["equ.{0}".format(count)] = (dep, exog, endog, instr, w) else: - data['equ.{0}'.format(count)] = (dep, exog, endog, instr) + data["equ.{0}".format(count)] = (dep, exog, endog, instr) count += 1 return data @@ -134,8 +161,8 @@ def simple_sur(y, x): b.append(lstsq(x[i], y[i])[0]) eps.append(y[i] - x[i] @ b[-1]) b = np.vstack(b) - out['beta0'] = b - out['eps0'] = eps + out["beta0"] = b + out["eps0"] = eps eps = np.hstack(eps) nobs = eps.shape[0] sigma = eps.T @ eps / nobs @@ -153,12 +180,12 @@ def simple_sur(y, x): row = np.hstack(row) bx.append(row) bx = np.vstack(bx) - xpx = (bx.T @ omegainv @ bx) - xpy = (bx.T @ omegainv @ by) + xpx = bx.T @ omegainv @ bx + xpy = bx.T @ omegainv @ by beta1 = np.linalg.solve(xpx, xpy) - out['beta1'] = beta1 - out['xpx'] = xpx - out['xpy'] = xpy + out["beta1"] = beta1 + out["xpx"] = xpx + out["xpy"] = xpy return out @@ -174,12 +201,12 @@ def simple_3sls(y, x, z): b.append(lstsq(xhat[i], y[i])[0]) eps.append(y[i] - x[i] @ b[-1]) b = np.vstack(b) - out['beta0'] = b - out['eps0'] = eps + out["beta0"] = b + out["eps0"] = eps eps = np.hstack(eps) nobs = eps.shape[0] sigma = eps.T @ eps / nobs - out['sigma'] = sigma + out["sigma"] = sigma omega = np.kron(sigma, np.eye(nobs)) omegainv = np.linalg.inv(omega) by = np.vstack([y[i] for i in range(k)]) @@ -194,46 +221,48 @@ def simple_3sls(y, x, 
z): row = np.hstack(row) bx.append(row) bx = np.vstack(bx) - xpx = (bx.T @ omegainv @ bx) - xpy = (bx.T @ omegainv @ by) + xpx = bx.T @ omegainv @ bx + xpy = bx.T @ omegainv @ by beta1 = np.linalg.solve(xpx, xpy) - out['beta1'] = beta1 - out['xpx'] = xpx - out['xpy'] = xpy + out["beta1"] = beta1 + out["xpx"] = xpx + out["xpy"] = xpy idx = 0 eps = [] for i in range(k): k = x[i].shape[1] - b = beta1[idx:idx + k] + b = beta1[idx : idx + k] eps.append(y[i] - x[i] @ b) idx += k eps = np.hstack(eps) nobs = eps.shape[0] sigma = eps.T @ eps / nobs - out['eps'] = eps - out['cov'] = np.linalg.inv(bx.T @ omegainv @ bx) + out["eps"] = eps + out["cov"] = np.linalg.inv(bx.T @ omegainv @ bx) return out def convert_to_pandas(a, base): k = a.shape[1] - cols = [base + '_{0}'.format(i) for i in range(k)] + cols = [base + "_{0}".format(i) for i in range(k)] return pd.DataFrame(a, columns=cols) -def generate_simultaneous_data(n=500, nsystem=3, nexog=3, ninstr=2, const=True, seed=1234): +def generate_simultaneous_data( + n=500, nsystem=3, nexog=3, ninstr=2, const=True, seed=1234 +): np.random.seed(seed) k = nexog + nsystem * ninstr beta = np.random.chisquare(3, (k, nsystem)) / 3 gam = np.random.standard_normal((nsystem, nsystem)) / np.sqrt(3) - gam.flat[::nsystem + 1] = 1.0 + gam.flat[:: nsystem + 1] = 1.0 x = np.random.standard_normal((n, k)) for i in range(nsystem): mask = np.zeros(k) mask[:nexog] = 1.0 - mask[nexog + i * ninstr: nexog + (i + 1) * ninstr] = 1.0 + mask[nexog + i * ninstr : nexog + (i + 1) * ninstr] = 1.0 beta[:, i] *= mask if const: x = np.concatenate([np.ones((n, 1)), x], 1) @@ -243,10 +272,10 @@ def generate_simultaneous_data(n=500, nsystem=3, nexog=3, ninstr=2, const=True, gaminv = np.linalg.inv(gam) y = x @ beta @ gaminv + eps @ gaminv eqns = {} - deps = convert_to_pandas(np.squeeze(y), 'dependent') - exogs = convert_to_pandas(x, 'exog') + deps = convert_to_pandas(np.squeeze(y), "dependent") + exogs = convert_to_pandas(x, "exog") if const: - exogs.columns = ['const'] + list(exogs.columns[1:]) + exogs.columns = ["const"] + list(exogs.columns[1:]) for i in range(nsystem): dep = deps.iloc[:, i] idx = sorted(set(range(nsystem)).difference([i])) @@ -258,14 +287,24 @@ def generate_simultaneous_data(n=500, nsystem=3, nexog=3, ninstr=2, const=True, ex_idx = list(range(const + nexog)) + list(drop) exog = exogs.iloc[:, ex_idx] idx = set(range(const + nexog, x.shape[1])) - instr = convert_to_pandas(x[:, sorted(idx.difference(drop))], 'instruments') + instr = convert_to_pandas(x[:, sorted(idx.difference(drop))], "instruments") eqn = dict(dependent=dep, exog=exog, endog=endog, instruments=instr) eqns[dep.name] = eqn return eqns -def generate_3sls_data_v2(n=500, k=3, nexog=3, nendog=2, ninstr=3, const=True, rho=0.5, - output_dict=True, seed=1234, omitted='none'): +def generate_3sls_data_v2( + n=500, + k=3, + nexog=3, + nendog=2, + ninstr=3, + const=True, + rho=0.5, + output_dict=True, + seed=1234, + omitted="none", +): np.random.seed(seed) eqns = AttrDict() for i in range(k): @@ -288,25 +327,26 @@ def generate_3sls_data_v2(n=500, k=3, nexog=3, nendog=2, ninstr=3, const=True, r x = np.hstack([np.ones((n, 1)), x]) exog = np.hstack([np.ones((n, 1)), exog]) dep = x @ params + eps + nendog * np.random.standard_normal((n, 1)) - if omitted == 'none' or omitted == 'drop': + if omitted == "none" or omitted == "drop": if exog.shape[1] == 0: exog = None if endog.shape[1] == 0: endog = None if instr.shape[1] == 0: instr = None - eqn = AttrDict(dependent=dep, exog=exog, endog=endog, instruments=instr, - 
params=params) - eqns['eqn.{0}'.format(i)] = eqn + eqn = AttrDict( + dependent=dep, exog=exog, endog=endog, instruments=instr, params=params + ) + eqns["eqn.{0}".format(i)] = eqn if not output_dict: for key in eqns: eq = eqns[key] eqns[key] = (eq.dependent, eq.exog, eq.endog, eq.instruments) else: - if omitted == 'drop': + if omitted == "drop": for key in eqns: eq = eqns[key] - for key2 in ('exog', 'endog', 'instruments'): + for key2 in ("exog", "endog", "instruments"): if eq[key2] is None: del eq[key2] @@ -323,7 +363,7 @@ def simple_gmm(y, x, z, robust=True, steps=2): idx = 0 for i in range(len(x)): _k = x[i].shape[1] - _x[nobs * i:nobs * (i + 1), idx:idx + _k] = x[i] + _x[nobs * i : nobs * (i + 1), idx : idx + _k] = x[i] idx += _k x = _x @@ -331,7 +371,7 @@ def simple_gmm(y, x, z, robust=True, steps=2): _z = np.zeros((k * nobs, kz)) for i in range(len(z)): _k = z[i].shape[1] - _z[nobs * i:nobs * (i + 1), idx:idx + _k] = z[i] + _z[nobs * i : nobs * (i + 1), idx : idx + _k] = z[i] idx += _k z = _z @@ -384,5 +424,12 @@ def simple_gmm(y, x, z, robust=True, steps=2): ze = z * eps g_bar = ze.sum(0) / nobs j_stat = nobs * g_bar @ wi @ g_bar - return AttrDict(beta0=beta0.ravel(), beta1=beta1.ravel(), w0=w0, - w1=w, sigma=sigma, cov=cov, j_stat=j_stat) + return AttrDict( + beta0=beta0.ravel(), + beta1=beta1.ravel(), + w0=w0, + w1=w, + sigma=sigma, + cov=cov, + j_stat=j_stat, + ) diff --git a/linearmodels/tests/system/results/execute-stata-3sls.py b/linearmodels/tests/system/results/execute-stata-3sls.py index d65ef491c7..1ab9fd10b8 100644 --- a/linearmodels/tests/system/results/execute-stata-3sls.py +++ b/linearmodels/tests/system/results/execute-stata-3sls.py @@ -14,7 +14,7 @@ out = [] for key in data: eqn = data[key] - for key in ('exog', 'endog'): + for key in ("exog", "endog"): vals = eqn[key] for col in vals: if col in all_cols: @@ -23,9 +23,9 @@ out.append(vals[col]) all_cols.append(col) out = concat(out, 1) -if 'const' in out: - out.pop('const') -out.to_stata('simulated-3sls.dta', write_index=False) +if "const" in out: + out.pop("const") +out.to_stata("simulated-3sls.dta", write_index=False) SEP = """ file open myfile using {outfile}, write append @@ -41,17 +41,22 @@ (dependent_2 dependent_0 dependent_1 exog_1 exog_2 exog_3 exog_8 exog_9), {method} """ -STATA_PATH = os.path.join('C:\\', 'Program Files (x86)', 'Stata13', 'StataMP-64.exe') -OUTFILE = os.path.join(os.getcwd(), 'stata-3sls-results.txt') +STATA_PATH = os.path.join("C:\\", "Program Files (x86)", "Stata15", "StataMP-64.exe") +OUTFILE = os.path.join(os.getcwd(), "stata-3sls-results.txt") -header = [r'use "C:\git\linearmodels\linearmodels\tests\system\results\simulated-3sls.dta", clear'] +header = [ + r'use "C:\git\linearmodels\linearmodels\tests\system\results\simulated-3sls.dta", clear' +] -all_stats = 'estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g)) p(fmt(%13.12g))) stats(' -stats = ['chi2_{0}', 'F_{0}', 'p_{0}', 'df_m{0}', 'mss_{0}', 'r2_{0}', 'rss_{0}'] +all_stats = "estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g)) p(fmt(%13.12g))) stats(" +stats = ["chi2_{0}", "F_{0}", "p_{0}", "df_m{0}", "mss_{0}", "r2_{0}", "rss_{0}"] for i in range(1, 4): - all_stats += ' '.join(map(lambda s: s.format(i), stats)) + ' ' -all_stats += ') append' -output = all_stats + '\n' + """ + all_stats += " ".join(map(lambda s: s.format(i), stats)) + " " +all_stats += ") append" +output = ( + all_stats + + "\n" + + """ file open myfile using {outfile}, write append file write myfile "*********** Variance ****************" _n @@ 
-69,21 +74,22 @@ estout matrix(Sigma, fmt(%13.12g)) using {outfile}, append """ +) output = output.format(outfile=OUTFILE) -methods = ('3sls', '2sls', 'ols', 'sur', '3sls ireg3') +methods = ("3sls", "2sls", "ols", "sur", "3sls ireg3") -with open('three-sls.do', 'w') as stata_file: - stata_file.write('\n\n'.join(header)) +with open("three-sls.do", "w") as stata_file: + stata_file.write("\n\n".join(header)) for method in methods: stata_file.write(SEP.format(method=method, outfile=OUTFILE)) - stata_file.write('\n\n'.join([CMD.format(method=method), output])) + stata_file.write("\n\n".join([CMD.format(method=method), output])) if os.path.exists(OUTFILE): os.unlink(OUTFILE) -do_file = os.path.join(os.getcwd(), 'three-sls.do') -cmd = [STATA_PATH, '/e', 'do', do_file] -print(' '.join(cmd)) +do_file = os.path.join(os.getcwd(), "three-sls.do") +cmd = [STATA_PATH, "/e", "do", do_file] +print(" ".join(cmd)) subprocess.call(cmd) diff --git a/linearmodels/tests/system/results/execute-stata.py b/linearmodels/tests/system/results/execute-stata.py index 2360e3a89b..d526468e0c 100644 --- a/linearmodels/tests/system/results/execute-stata.py +++ b/linearmodels/tests/system/results/execute-stata.py @@ -12,17 +12,22 @@ from linearmodels.tests.system._utility import generate_data -STATA_PATH = os.path.join('C:\\', 'Program Files (x86)', 'Stata13', 'StataMP-64.exe') -OUTFILE = os.path.join(os.getcwd(), 'stata-sur-results.txt') +STATA_PATH = os.path.join("C:\\", "Program Files (x86)", "Stata13", "StataMP-64.exe") +OUTFILE = os.path.join(os.getcwd(), "stata-sur-results.txt") -header = [r'use "C:\git\linearmodels\linearmodels\tests\system\results\simulated-sur.dta", clear'] +header = [ + r'use "C:\git\linearmodels\linearmodels\tests\system\results\simulated-sur.dta", clear' +] -all_stats = 'estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g)) p(fmt(%13.12g))) stats(' -stats = ['chi2_{0}', 'F_{0}', 'p_{0}', 'df_m{0}', 'mss_{0}', 'r2_{0}', 'rss_{0}'] +all_stats = "estout using {outfile}, cells(b(fmt(%13.12g)) t(fmt(%13.12g)) p(fmt(%13.12g))) stats(" +stats = ["chi2_{0}", "F_{0}", "p_{0}", "df_m{0}", "mss_{0}", "r2_{0}", "rss_{0}"] for i in range(1, 4): - all_stats += ' '.join(map(lambda s: s.format(i), stats)) + ' ' -all_stats += ') append' -output = all_stats + '\n' + """ + all_stats += " ".join(map(lambda s: s.format(i), stats)) + " " +all_stats += ") append" +output = ( + all_stats + + "\n" + + """ file open myfile using {outfile}, write append file write myfile "*********** Variance ****************" _n @@ -40,6 +45,7 @@ estout matrix(Sigma, fmt(%13.12g)) using {outfile}, append """ +) output = output.format(outfile=OUTFILE) data = generate_data(n=200, k=3, p=[2, 3, 4], const=True, seed=0) @@ -48,63 +54,63 @@ cmds = [] for i, dataset in enumerate((data, common_data, missing_data)): - base = 'mod_{0}'.format(i) - cmd = '' + base = "mod_{0}".format(i) + cmd = "" for j, key in enumerate(dataset): - dep = dataset[key]['dependent'] - dep = pd.DataFrame(dep, columns=[base + '_y_{0}'.format(j)]) - exog = dataset[key]['exog'][:, 1:] - exog_cols = [base + '_x_{0}{1}'.format(j, k) for k in range(exog.shape[1])] + dep = dataset[key]["dependent"] + dep = pd.DataFrame(dep, columns=[base + "_y_{0}".format(j)]) + exog = dataset[key]["exog"][:, 1:] + exog_cols = [base + "_x_{0}{1}".format(j, k) for k in range(exog.shape[1])] exog = pd.DataFrame(exog, columns=exog_cols) if i != 1 or j == 0: - cmd += ' ( ' + ' '.join(list(dep.columns) + list(exog.columns)) + ' ) ' + cmd += " ( " + " ".join(list(dep.columns) + 
list(exog.columns)) + " ) " else: - new_cmd = cmd[:cmd.find(')') + 1] - new_cmd = new_cmd.replace('mod_1_y_0', 'mod_1_y_{0}'.format(j)) + new_cmd = cmd[: cmd.find(")") + 1] + new_cmd = new_cmd.replace("mod_1_y_0", "mod_1_y_{0}".format(j)) cmd += new_cmd cmds.append(cmd) outcmds = {} -key_bases = ['basic', 'common', 'missing'] +key_bases = ["basic", "common", "missing"] for key_base, cmd in zip(key_bases, cmds): - base = 'sureg ' + cmd - ss = base + ', small dfk' - comp = cmd.replace('(', '').strip().split(')')[:-1] + base = "sureg " + cmd + ss = base + ", small dfk" + comp = cmd.replace("(", "").strip().split(")")[:-1] comp = list(map(lambda s: s.strip(), comp)) - deps = [c.split(' ')[0] for c in comp] - first = [c.split(' ')[1] for c in comp] + deps = [c.split(" ")[0] for c in comp] + first = [c.split(" ")[1] for c in comp] vals = {} i = 0 for d, f in zip(deps, first): - vals['y' + str(i)] = d - vals['x' + str(i)] = f + vals["y" + str(i)] = d + vals["x" + str(i)] = f i += 1 constraint = """ constraint 1 [{y0}]{x0} = [{y1}]{x1} constraint 2 [{y0}]{x0} = [{y2}]{x2} """ - cons = constraint.format(**vals) + base + ', const (1 2)' - outcmds[key_base + '-base'] = base - outcmds[key_base + '-ss'] = ss - outcmds[key_base + '-constrained'] = cons + cons = constraint.format(**vals) + base + ", const (1 2)" + outcmds[key_base + "-base"] = base + outcmds[key_base + "-ss"] = ss + outcmds[key_base + "-constrained"] = cons sep = """ file open myfile using {outfile}, write append \n file write myfile "#################!{key}!####################" _n \n file close myfile\n """ -with open('sur.do', 'w') as stata_file: - stata_file.write('\n'.join(header) + '\n') +with open("sur.do", "w") as stata_file: + stata_file.write("\n".join(header) + "\n") for outcmd in outcmds: stata_file.write(sep.format(outfile=OUTFILE, key=outcmd)) - stata_file.write(outcmds[outcmd] + '\n') - stata_file.write('\n{0}\n\n'.format(output)) - stata_file.write('\n' * 5) + stata_file.write(outcmds[outcmd] + "\n") + stata_file.write("\n{0}\n\n".format(output)) + stata_file.write("\n" * 5) if os.path.exists(OUTFILE): os.unlink(OUTFILE) -do_file = os.path.join(os.getcwd(), 'sur.do') -cmd = [STATA_PATH, '/e', 'do', do_file] -print(' '.join(cmd)) +do_file = os.path.join(os.getcwd(), "sur.do") +cmd = [STATA_PATH, "/e", "do", do_file] +print(" ".join(cmd)) subprocess.call(cmd) diff --git a/linearmodels/tests/system/results/generate_data.py b/linearmodels/tests/system/results/generate_data.py index 838525fb9b..55beda8d0b 100644 --- a/linearmodels/tests/system/results/generate_data.py +++ b/linearmodels/tests/system/results/generate_data.py @@ -19,33 +19,33 @@ np.random.seed(1234) for key in missing_data: - dep = missing_data[key]['dependent'] + dep = missing_data[key]["dependent"] locs = np.where(np.random.random_sample(dep.shape[0]) < 0.02)[0] if np.any(locs): dep.flat[locs] = np.nan - exog = missing_data[key]['exog'] + exog = missing_data[key]["exog"] locs = np.where(np.random.random_sample(np.prod(exog.shape)) < 0.02)[0] if np.any(locs): exog.flat[locs] = np.nan out = [] for i, dataset in enumerate((basic_data, common_data, missing_data)): - base = 'mod_{0}'.format(i) + base = "mod_{0}".format(i) for j, key in enumerate(dataset): - dep = dataset[key]['dependent'] - dep = pd.DataFrame(dep, columns=[base + '_y_{0}'.format(j)]) - dataset[key]['dependent'] = dep - exog = dataset[key]['exog'][:, 1:] - exog_cols = [base + '_x_{0}{1}'.format(j, k) for k in range(exog.shape[1])] + dep = dataset[key]["dependent"] + dep = pd.DataFrame(dep, 
columns=[base + "_y_{0}".format(j)]) + dataset[key]["dependent"] = dep + exog = dataset[key]["exog"][:, 1:] + exog_cols = [base + "_x_{0}{1}".format(j, k) for k in range(exog.shape[1])] exog = pd.DataFrame(exog, columns=exog_cols) exog = exog.copy() - exog['cons'] = 1.0 - dataset[key]['exog'] = exog + exog["cons"] = 1.0 + dataset[key]["exog"] = exog if i != 1 or j == 0: out.extend([dep, exog]) else: out.extend([dep]) -if __name__ == '__main__': +if __name__ == "__main__": df = concat(out, 1) - df.to_stata('simulated-sur.dta') + df.to_stata("simulated-sur.dta") diff --git a/linearmodels/tests/system/results/parse_stata_3sls_results.py b/linearmodels/tests/system/results/parse_stata_3sls_results.py index 135abae1fb..bb6640413d 100644 --- a/linearmodels/tests/system/results/parse_stata_3sls_results.py +++ b/linearmodels/tests/system/results/parse_stata_3sls_results.py @@ -11,75 +11,77 @@ def process_block(results): for i, line in enumerate(results): - if line.startswith('chi2_1'): + if line.startswith("chi2_1"): stat_start = i - elif '* Variance' in line: + elif "* Variance" in line: variance_start = i + 2 - elif '* Sigma' in line: + elif "* Sigma" in line: sigma_start = i + 2 param_results = results[:stat_start] - stats = results[stat_start:variance_start - 2] - variance = results[variance_start:sigma_start - 2] + stats = results[stat_start : variance_start - 2] + variance = results[variance_start : sigma_start - 2] sigma = results[sigma_start:] def parse_block(block): - values = pd.read_csv(StringIO('\n'.join(block)), header=None) + values = pd.read_csv(StringIO("\n".join(block)), header=None) nums = np.asarray(values.iloc[:, -1]) nums = np.reshape(nums, (len(nums) // 3, 3)) - values = pd.DataFrame(nums, index=values.iloc[::3, 0], columns=['param', 'tstat', 'pval']) - values.index.name = '' + values = pd.DataFrame( + nums, index=values.iloc[::3, 0], columns=["param", "tstat", "pval"] + ) + values.index.name = "" return values params = {} block = [] key = None for line in param_results[2:]: - contents = list(map(lambda s: s.strip(), line.split('\t'))) - if contents[0] != '' and contents[1] == '': + contents = list(map(lambda s: s.strip(), line.split("\t"))) + if contents[0] != "" and contents[1] == "": if key is not None: params[key] = parse_block(block) key = contents[0] block = [] else: - block.append(','.join(contents)) + block.append(",".join(contents)) params[key] = parse_block(block) stat_values = AttrDict() for line in stats: - contents = line.strip().split('\t') + contents = line.strip().split("\t") if len(contents) > 1 and contents[0] and contents[1]: stat_values[contents[0]] = float(contents[1]) stats = stat_values - variance = list(map(lambda s: s.replace('\t', ','), variance)) + variance = list(map(lambda s: s.replace("\t", ","), variance)) header = variance[0] block = [] for line in variance[1:]: - if ',,,' in line: + if ",,," in line: continue else: block.append(line) - out = pd.read_csv(StringIO(''.join([header] + block))) + out = pd.read_csv(StringIO("".join([header] + block))) out = out.iloc[:, 1:] - out.index = header.strip().split(',')[1:] + out.index = header.strip().split(",")[1:] vcv = out - sigma = list(map(lambda s: s.replace('\t', ','), sigma)) - sigma = pd.read_csv(StringIO(''.join(sigma)), index_col=0) + sigma = list(map(lambda s: s.replace("\t", ","), sigma)) + sigma = pd.read_csv(StringIO("".join(sigma)), index_col=0) return AttrDict(sigma=sigma, params=params, variance=vcv, stats=stats) -with open(os.path.join(base, 'stata-3sls-results.txt'), 'r') as 
stata_results: +with open(os.path.join(base, "stata-3sls-results.txt"), "r") as stata_results: stata_results = stata_results.readlines() block = [] results = {} key = None for line in stata_results: - if '!!!!' in line: + if "!!!!" in line: if key is not None: results[key] = process_block(block) - key = line.replace('!', '').strip() + key = line.replace("!", "").strip() block = [] else: block.append(line) diff --git a/linearmodels/tests/system/results/parse_stata_results.py b/linearmodels/tests/system/results/parse_stata_results.py index cd634601af..a072cd81c2 100644 --- a/linearmodels/tests/system/results/parse_stata_results.py +++ b/linearmodels/tests/system/results/parse_stata_results.py @@ -5,22 +5,22 @@ from linearmodels.utility import AttrDict -filename = 'stata-sur-results.txt' +filename = "stata-sur-results.txt" cwd = os.path.split(os.path.abspath(__file__))[0] -with open(os.path.join(cwd, filename), 'r') as results_file: +with open(os.path.join(cwd, filename), "r") as results_file: results = results_file.readlines() blocks = {} block = [] -key = '' +key = "" for line in results: - if '###!' in line: + if "###!" in line: if block: blocks[key] = block block = [] - key = line.strip().split('!')[1] + key = line.strip().split("!")[1] block = [] block.append(line) blocks[key] = block @@ -31,84 +31,86 @@ def split_block(block): block = block[:] for i, line in enumerate(block): - if '** Sigma **' in line: - sigma = block[i + 2:] + if "** Sigma **" in line: + sigma = block[i + 2 :] block = block[:i] for i, line in enumerate(block): - if '** Variance **' in line: - variance = block[i + 2:] + if "** Variance **" in line: + variance = block[i + 2 :] block = block[:i] for i, line in enumerate(block): - if 'chi2_' in line or 'F_' in line: + if "chi2_" in line or "F_" in line: stats = block[i:] params = block[:i] break - return AttrDict(sigma=process_sigma(sigma), - variance=process_variance(variance), - stats=process_stats(stats), - params=process_params(params)) + return AttrDict( + sigma=process_sigma(sigma), + variance=process_variance(variance), + stats=process_stats(stats), + params=process_params(params), + ) def process_stats(stats): - sio = StringIO(''.join(stats)) - values = pd.read_csv(sio, sep='\t', header=None, index_col=0, engine='c') - values.columns = ['value'] - values.index.name = 'stat' - values = values.astype('float64') + sio = StringIO("".join(stats)) + values = pd.read_csv(sio, sep="\t", header=None, index_col=0, engine="c") + values.columns = ["value"] + values.index.name = "stat" + values = values.astype("float64") return values def process_sigma(sigma): - sio = StringIO(''.join(sigma)) - values = pd.read_csv(sio, sep='\t', index_col=0) + sio = StringIO("".join(sigma)) + values = pd.read_csv(sio, sep="\t", index_col=0) return values def process_variance(variance): - key = '' + key = "" new = [variance[0]] for line in variance[1:]: - if '\t\t' in line: - key = line.split('\t')[0] + if "\t\t" in line: + key = line.split("\t")[0] continue - new.append(key + '_' + line) - sio = StringIO(''.join(new)) - values = pd.read_csv(sio, sep='\t', index_col=0) - values.index = [i.replace('__', '_') for i in values.index] - values.columns = [c.replace(':', '_').replace('__', '_') for c in values.columns] + new.append(key + "_" + line) + sio = StringIO("".join(new)) + values = pd.read_csv(sio, sep="\t", index_col=0) + values.index = [i.replace("__", "_") for i in values.index] + values.columns = [c.replace(":", "_").replace("__", "_") for c in values.columns] return values def 
process_params(params): reformatted = [] values = [] - key = var_name = '' + key = var_name = "" for line in params[3:]: - if '\t\n' in line: + if "\t\n" in line: if values: - new_line = key + '_' + var_name + '\t' + '\t'.join(values) + new_line = key + "_" + var_name + "\t" + "\t".join(values) reformatted.append(new_line) values = [] - key = line.split('\t')[0] + key = line.split("\t")[0] continue - if line.split('\t')[0].strip(): + if line.split("\t")[0].strip(): if values: - new_line = key + '_' + var_name + '\t' + '\t'.join(values) + new_line = key + "_" + var_name + "\t" + "\t".join(values) reformatted.append(new_line) values = [] - var_name = line.split('\t')[0].strip() - values.append(line.split('\t')[1].strip()) - new_line = key + '_' + var_name + '\t' + '\t'.join(values) + var_name = line.split("\t")[0].strip() + values.append(line.split("\t")[1].strip()) + new_line = key + "_" + var_name + "\t" + "\t".join(values) reformatted.append(new_line) - sio = StringIO('\n'.join(reformatted)) - values = pd.read_csv(sio, sep='\t', index_col=0, header=None) + sio = StringIO("\n".join(reformatted)) + values = pd.read_csv(sio, sep="\t", index_col=0, header=None) new_index = [] for idx in list(values.index): - new_index.append(idx.replace('__', '_')) + new_index.append(idx.replace("__", "_")) values.index = new_index - values.index.name = 'param' - values.columns = ['param', 'tstat', 'pval'] + values.index.name = "param" + values.columns = ["param", "tstat", "pval"] return values diff --git a/linearmodels/tests/system/test_3sls.py b/linearmodels/tests/system/test_3sls.py index f5b54f4b14..6a9196fd33 100644 --- a/linearmodels/tests/system/test_3sls.py +++ b/linearmodels/tests/system/test_3sls.py @@ -20,29 +20,35 @@ common_exog = [True, False] included_weights = [True, False] output_dict = [True, False] -params = list(product(nexog, nendog, ninstr, const, rho, common_exog, - included_weights, output_dict)) +params = list( + product( + nexog, nendog, ninstr, const, rho, common_exog, included_weights, output_dict + ) +) nexog = [[0, 1, 2]] nendog = [[1, 0, 1]] ninstr = [[2, 0, 1]] # Explicitly test variables that have no columns -add_params = list(product(nexog, nendog, ninstr, const, rho, common_exog, - included_weights, output_dict)) +add_params = list( + product( + nexog, nendog, ninstr, const, rho, common_exog, included_weights, output_dict + ) +) params += add_params def gen_id(param): - idstr = 'homo' if isinstance(param[0], list) else 'hetero' - idstr += '-homo_endog' if isinstance(param[1], list) else '-hetero_endog' - idstr += '-homo_instr' if isinstance(param[2], list) else '-hetero_instr' - idstr += '-const' if param[3] else '' - idstr += '-correl' if param[4] != 0 else '' - idstr += '-common' if param[5] else '' - idstr += '-weights' if param[6] else '' - idstr += '-dict' if param[7] else '-tuple' + idstr = "homo" if isinstance(param[0], list) else "hetero" + idstr += "-homo_endog" if isinstance(param[1], list) else "-hetero_endog" + idstr += "-homo_instr" if isinstance(param[2], list) else "-hetero_instr" + idstr += "-const" if param[3] else "" + idstr += "-correl" if param[4] != 0 else "" + idstr += "-common" if param[5] else "" + idstr += "-weights" if param[6] else "" + idstr += "-dict" if param[7] else "-tuple" return idstr @@ -59,6 +65,7 @@ def data(request): en = 2 instr = 3 elif list_like: + def safe_len(a): a = np.array(a) if a.ndim == 0: @@ -67,14 +74,23 @@ def safe_len(a): k = max(map(safe_len, [p, en, instr])) - return generate_3sls_data(n=250, k=k, p=p, en=en, instr=instr, 
const=const, rho=rho, - common_exog=common_exog, included_weights=included_weights, - output_dict=output_dict) + return generate_3sls_data( + n=250, + k=k, + p=p, + en=en, + instr=instr, + const=const, + rho=rho, + common_exog=common_exog, + included_weights=included_weights, + output_dict=output_dict, + ) def test_direct_simple(data): mod = IV3SLS(data) - res = mod.fit(cov_type='unadjusted') + res = mod.fit(cov_type="unadjusted") y = [] x = [] @@ -92,14 +108,18 @@ def test_direct_simple(data): if len(val) == 5: return # weighted else: - y.append(val['dependent']) - nobs = val['dependent'].shape[0] - vexog = val['exog'] if val['exog'] is not None else np.empty((nobs, 0)) - vendog = val['endog'] if val['endog'] is not None else np.empty((nobs, 0)) - vinstr = val['instruments'] if val['instruments'] is not None else np.empty((nobs, 0)) + y.append(val["dependent"]) + nobs = val["dependent"].shape[0] + vexog = val["exog"] if val["exog"] is not None else np.empty((nobs, 0)) + vendog = val["endog"] if val["endog"] is not None else np.empty((nobs, 0)) + vinstr = ( + val["instruments"] + if val["instruments"] is not None + else np.empty((nobs, 0)) + ) x.append(np.concatenate([vexog, vendog], 1)) z.append(np.concatenate([vexog, vinstr], 1)) - if 'weights' in val: + if "weights" in val: return # weighted out = simple_3sls(y, x, z) assert_allclose(res.params.values, out.beta1.squeeze()) @@ -113,7 +133,7 @@ def test_single_equation(data): data = {key: data[key]} mod = IV3SLS(data) - res = mod.fit(cov_type='unadjusted') + res = mod.fit(cov_type="unadjusted") y = [] x = [] @@ -127,10 +147,10 @@ def test_single_equation(data): if len(val) == 5: return # weighted else: - y.append(val['dependent']) - x.append(np.concatenate([val['exog'], val['endog']], 1)) - z.append(np.concatenate([val['exog'], val['instruments']], 1)) - if 'weights' in val: + y.append(val["dependent"]) + x.append(np.concatenate([val["exog"], val["endog"]], 1)) + z.append(np.concatenate([val["exog"], val["instruments"]], 1)) + if "weights" in val: return # weighted out = simple_3sls(y, x, z) assert_allclose(res.params.values, out.beta1.squeeze()) @@ -147,7 +167,7 @@ def test_too_few_instruments(): instr = np.random.standard_normal((n, 1)) eqns = {} for i in range(2): - eqns['eqn.{0}'.format(i)] = (dep[:, i], exog, endog, instr) + eqns["eqn.{0}".format(i)] = (dep[:, i], exog, endog, instr) with pytest.raises(ValueError): IV3SLS(eqns) @@ -161,7 +181,7 @@ def test_redundant_instruments(): instr = np.concatenate([exog, instr], 1) eqns = {} for i in range(2): - eqns['eqn.{0}'.format(i)] = (dep[:, i], exog, endog, instr) + eqns["eqn.{0}".format(i)] = (dep[:, i], exog, endog, instr) with pytest.raises(ValueError): IV3SLS(eqns) @@ -174,7 +194,7 @@ def test_too_many_instruments(): instr = np.random.standard_normal((n, n + 1)) eqns = {} for i in range(2): - eqns['eqn.{0}'.format(i)] = (dep[:, i], exog, endog, instr) + eqns["eqn.{0}".format(i)] = (dep[:, i], exog, endog, instr) with pytest.raises(ValueError): IV3SLS(eqns) @@ -203,14 +223,14 @@ def test_multivariate_iv(): n = 250 dep = np.random.standard_normal((n, 2)) exog = np.random.standard_normal((n, 3)) - exog = DataFrame(exog, columns=['exog.{0}'.format(i) for i in range(3)]) + exog = DataFrame(exog, columns=["exog.{0}".format(i) for i in range(3)]) endog = np.random.standard_normal((n, 2)) - endog = DataFrame(endog, columns=['endog.{0}'.format(i) for i in range(2)]) + endog = DataFrame(endog, columns=["endog.{0}".format(i) for i in range(2)]) instr = np.random.standard_normal((n, 3)) - instr 
= DataFrame(instr, columns=['instr.{0}'.format(i) for i in range(3)]) + instr = DataFrame(instr, columns=["instr.{0}".format(i) for i in range(3)]) eqns = {} for i in range(2): - eqns['dependent.{0}'.format(i)] = (dep[:, i], exog, endog, instr) + eqns["dependent.{0}".format(i)] = (dep[:, i], exog, endog, instr) mod = IV3SLS(eqns) res = mod.fit() @@ -224,7 +244,7 @@ def test_multivariate_iv_bad_data(): n = 250 dep = np.random.standard_normal((n, 2)) instr = np.random.standard_normal((n, 3)) - instr = DataFrame(instr, columns=['instr.{0}'.format(i) for i in range(3)]) + instr = DataFrame(instr, columns=["instr.{0}".format(i) for i in range(3)]) with pytest.raises(ValueError): IV3SLS.multivariate_ls(dep, None, None, instr) @@ -237,15 +257,18 @@ def test_fitted(data): for i, key in enumerate(res.equations): eq = res.equations[key] fv = res.fitted_values[key].copy() - fv.name = 'fitted_values' + fv.name = "fitted_values" assert_series_equal(eq.fitted_values, fv) b = eq.params.values direct = mod._x[i] @ b expected.append(direct[:, None]) assert_allclose(eq.fitted_values, direct, atol=1e-8) expected = np.concatenate(expected, 1) - expected = DataFrame(expected, index=mod._dependent[i].pandas.index, - columns=[key for key in res.equations]) + expected = DataFrame( + expected, + index=mod._dependent[i].pandas.index, + columns=[key for key in res.equations], + ) assert_frame_equal(expected, res.fitted_values) @@ -254,11 +277,11 @@ def test_no_exog(): mod = IV3SLS(data) res = mod.fit() - data = generate_3sls_data_v2(nexog=0, const=False, omitted='drop') + data = generate_3sls_data_v2(nexog=0, const=False, omitted="drop") mod = IV3SLS(data) res2 = mod.fit() - data = generate_3sls_data_v2(nexog=0, const=False, omitted='empty') + data = generate_3sls_data_v2(nexog=0, const=False, omitted="empty") mod = IV3SLS(data) res3 = mod.fit() @@ -266,7 +289,9 @@ def test_no_exog(): mod = IV3SLS(data) res4 = mod.fit() - data = generate_3sls_data_v2(nexog=0, const=False, output_dict=False, omitted='empty') + data = generate_3sls_data_v2( + nexog=0, const=False, output_dict=False, omitted="empty" + ) mod = IV3SLS(data) res5 = mod.fit() assert_series_equal(res.params, res2.params) @@ -280,11 +305,11 @@ def test_no_endog(): mod = IV3SLS(data) res = mod.fit() - data = generate_3sls_data_v2(nendog=0, ninstr=0, omitted='drop') + data = generate_3sls_data_v2(nendog=0, ninstr=0, omitted="drop") mod = IV3SLS(data) res2 = mod.fit() - data = generate_3sls_data_v2(nendog=0, ninstr=0, omitted='empty') + data = generate_3sls_data_v2(nendog=0, ninstr=0, omitted="empty") mod = IV3SLS(data) res3 = mod.fit() @@ -292,7 +317,7 @@ def test_no_endog(): mod = IV3SLS(data) res4 = mod.fit() - data = generate_3sls_data_v2(nendog=0, ninstr=0, output_dict=False, omitted='empty') + data = generate_3sls_data_v2(nendog=0, ninstr=0, output_dict=False, omitted="empty") mod = IV3SLS(data) res5 = mod.fit() assert_series_equal(res.params, res2.params) @@ -304,6 +329,6 @@ def test_no_endog(): def test_uneven_shapes(): data = generate_3sls_data_v2() eq = data[list(data.keys())[0]] - eq['weights'] = np.ones(eq.dependent.shape[0] // 2) + eq["weights"] = np.ones(eq.dependent.shape[0] // 2) with pytest.raises(ValueError): IV3SLS(data) diff --git a/linearmodels/tests/system/test_3sls_against_stata.py b/linearmodels/tests/system/test_3sls_against_stata.py index 744ef7153b..881f562743 100644 --- a/linearmodels/tests/system/test_3sls_against_stata.py +++ b/linearmodels/tests/system/test_3sls_against_stata.py @@ -8,32 +8,37 @@ from 
linearmodels.tests.system.results.parse_stata_3sls_results import results -@pytest.fixture(scope='module', params=list(results.keys())) +@pytest.fixture(scope="module", params=list(results.keys())) def fit(request): method = request.param data = generate_simultaneous_data() - if 'ols' in method or 'sur' in method: + if "ols" in method or "sur" in method: mod = SUR for key in data: temp = data[key] - temp['exog'] = concat([temp['exog'], temp['endog']], 1) - del temp['endog'] - del temp['instruments'] + temp["exog"] = concat([temp["exog"], temp["endog"]], 1) + del temp["endog"] + del temp["instruments"] else: mod = IV3SLS - if 'ols' in method or '2sls' in method: - fit_method = 'ols' + if "ols" in method or "2sls" in method: + fit_method = "ols" else: - fit_method = 'gls' + fit_method = "gls" mod = mod(data) - iterate = 'ireg3' in method + iterate = "ireg3" in method stata = results[method] - debiased = method in ('ols', '2sls') + debiased = method in ("ols", "2sls") kwargs = {} - decimal = 2 if 'ireg3' in method else 5 + decimal = 3 if "ireg3" in method else 5 rtol = 10 ** -decimal - res = mod.fit(cov_type='unadjusted', method=fit_method, - debiased=debiased, iterate=iterate, **kwargs) + res = mod.fit( + cov_type="unadjusted", + method=fit_method, + debiased=debiased, + iterate=iterate, + **kwargs + ) return stata, res, rtol @@ -42,10 +47,10 @@ def test_params(fit): for idx in result.params.index: val = result.params[idx] - dep = '_'.join(idx.split('_')[:2]) - variable = '_'.join(idx.split('_')[2:]) - variable = '_cons' if variable == 'const' else variable - stata_val = stata.params[dep].loc[variable, 'param'] + dep = "_".join(idx.split("_")[:2]) + variable = "_".join(idx.split("_")[2:]) + variable = "_cons" if variable == "const" else variable + stata_val = stata.params[dep].loc[variable, "param"] assert_allclose(stata_val, val, rtol=rtol) @@ -55,10 +60,10 @@ def test_tstats(fit): for idx in result.tstats.index: val = result.tstats[idx] - dep = '_'.join(idx.split('_')[:2]) - variable = '_'.join(idx.split('_')[2:]) - variable = '_cons' if variable == 'const' else variable - stata_val = stata.params[dep].loc[variable, 'tstat'] + dep = "_".join(idx.split("_")[:2]) + variable = "_".join(idx.split("_")[2:]) + variable = "_cons" if variable == "const" else variable + stata_val = stata.params[dep].loc[variable, "tstat"] assert_allclose(stata_val, val, rtol=rtol) @@ -67,10 +72,10 @@ def test_pval(fit): for idx in result.pvalues.index: val = result.pvalues[idx] - dep = '_'.join(idx.split('_')[:2]) - variable = '_'.join(idx.split('_')[2:]) - variable = '_cons' if variable == 'const' else variable - stata_val = stata.params[dep].loc[variable, 'pval'] + dep = "_".join(idx.split("_")[:2]) + variable = "_".join(idx.split("_")[2:]) + variable = "_cons" if variable == "const" else variable + stata_val = stata.params[dep].loc[variable, "pval"] assert_allclose(1 + stata_val, 1 + val, rtol=rtol) diff --git a/linearmodels/tests/system/test_covariance.py b/linearmodels/tests/system/test_covariance.py index b7a3d5d6cf..c3706005c4 100644 --- a/linearmodels/tests/system/test_covariance.py +++ b/linearmodels/tests/system/test_covariance.py @@ -17,7 +17,7 @@ from linearmodels.tests.system._utility import generate_3sls_data_v2 covs = [HeteroskedasticCovariance, HomoskedasticCovariance] -names = ['Heteroskedastic', 'Homoskedastic'] +names = ["Heteroskedastic", "Homoskedastic"] @pytest.fixture(params=list(zip(covs, names))) @@ -50,11 +50,11 @@ def gmm_cov(request): return est(x, z, eps, w, sigma=sigma), name 
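# A minimal sketch, assuming a list of per-equation regressor arrays `x` and a
# residual matrix `eps` with one column per equation, of the sandwich form the
# *_direct tests below rebuild by hand: the bread is the inverse of the
# block-diagonal stacked X'X/n, the meat the outer product of the stacked
# per-equation scores x_i * eps_i. The helper name `sandwich_cov` is hypothetical.
import numpy as np
from scipy.linalg import block_diag

def sandwich_cov(x, eps):
    nobs = x[0].shape[0]
    # Bread: inverse of the block-diagonal stacked X'X / n.
    xpxi = np.linalg.inv(block_diag(*[xi.T @ xi / nobs for xi in x]))
    # Meat: outer product of the stacked scores x_i * eps_i.
    xe = np.concatenate([x[i] * eps[:, i:i + 1] for i in range(len(x))], 1)
    xeex = xe.T @ xe / nobs
    return xpxi @ xeex @ xpxi / nobs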
-@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def cov_data(): data = generate_3sls_data_v2(k=2) mod = IV3SLS(data) - res = mod.fit(cov_type='unadjusted') + res = mod.fit(cov_type="unadjusted") x = mod._x z = mod._z eps = res.resids.values @@ -78,7 +78,7 @@ def _xpxi(x): for j in range(k): if i == j: kx = x[i].shape[1] - xpx[loc:loc + kx, loc:loc + kx] = x[i].T @ x[i] / nobs + xpx[loc : loc + kx, loc : loc + kx] = x[i].T @ x[i] / nobs loc += kx return np.linalg.inv(xpx) @@ -112,7 +112,7 @@ def test_str_repr(cov): assert name in str(est) assert name in est.__repr__() assert str(hex(id(est))) in est.__repr__() - assert 'Debiased: True' in str(est) + assert "Debiased: True" in str(est) def test_gmm_str_repr(gmm_cov): @@ -120,7 +120,7 @@ def test_gmm_str_repr(gmm_cov): assert name in str(est) assert name in est.__repr__() assert str(hex(id(est))) in est.__repr__() - assert 'GMM' in str(est) + assert "GMM" in str(est) def test_homoskedastic_direct(cov_data, debias): @@ -154,7 +154,7 @@ def test_heteroskedastic_direct(cov_data, debias): x, z, eps, sigma = cov_data cov = HeteroskedasticCovariance(x, eps, sigma, sigma, debiased=debias) k = len(x) - xe = [x[i] * eps[:, i:i + 1] for i in range(k)] + xe = [x[i] * eps[:, i : i + 1] for i in range(k)] xe = np.concatenate(xe, 1) nobs = xe.shape[0] xeex = np.zeros((xe.shape[1], xe.shape[1])) @@ -180,10 +180,18 @@ def test_kernel_direct(cov_data, debias): x, z, eps, sigma = cov_data k = len(x) bandwidth = 12 - cov = KernelCovariance(x, eps, sigma, sigma, gls=False, debiased=debias, - kernel='parzen', bandwidth=bandwidth) + cov = KernelCovariance( + x, + eps, + sigma, + sigma, + gls=False, + debiased=debias, + kernel="parzen", + bandwidth=bandwidth, + ) assert cov.bandwidth == 12 - xe = [x[i] * eps[:, i:i + 1] for i in range(k)] + xe = [x[i] * eps[:, i : i + 1] for i in range(k)] xe = np.concatenate(xe, 1) w = kernel_weight_parzen(12) nobs = xe.shape[0] @@ -257,7 +265,7 @@ def test_gmm_heterosedastic_direct(cov_data, debias): xpz = _xpz(x, z) wi = np.linalg.inv(w) xpz_wi = xpz @ wi - ze = [z[i] * eps[:, i:i + 1] for i in range(k)] + ze = [z[i] * eps[:, i : i + 1] for i in range(k)] ze = np.concatenate(ze, 1) zeez = ze.T @ ze / nobs assert_allclose(zeez, cov_est._omega()) @@ -279,16 +287,24 @@ def test_gmm_kernel_direct(cov_data): bandwidth = 12 k = len(x) nobs = x[0].shape[0] - wm = KernelWeightMatrix(kernel='bartlett', bandwidth=bandwidth) + wm = KernelWeightMatrix(kernel="bartlett", bandwidth=bandwidth) w = wm.weight_matrix(x, z, eps, sigma=sigma) - cov_est = GMMKernelCovariance(x, z, eps, w, sigma=sigma, debiased=debias, kernel='bartlett', - bandwidth=bandwidth) + cov_est = GMMKernelCovariance( + x, + z, + eps, + w, + sigma=sigma, + debiased=debias, + kernel="bartlett", + bandwidth=bandwidth, + ) xpz_wi_zpxi = _xpz_wi_zpxi(x, z, w) xpz = _xpz(x, z) wi = np.linalg.inv(w) xpz_wi = xpz @ wi - ze = [z[i] * eps[:, i:i + 1] for i in range(k)] + ze = [z[i] * eps[:, i : i + 1] for i in range(k)] ze = np.concatenate(ze, 1) zeez = ze.T @ ze / nobs w = kernel_weight_bartlett(bandwidth) diff --git a/linearmodels/tests/system/test_equivalence.py b/linearmodels/tests/system/test_equivalence.py index db1c5bcbfe..dd442678fb 100644 --- a/linearmodels/tests/system/test_equivalence.py +++ b/linearmodels/tests/system/test_equivalence.py @@ -7,27 +7,27 @@ def test_gmm_3sls_equiv(): eqns = generate_3sls_data_v2(k=3) gmm = IVSystemGMM(eqns).fit(iter_limit=1) - tsls = IV3SLS(eqns).fit(method='ols') + tsls = IV3SLS(eqns).fit(method="ols") 
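# A minimal sketch, assuming conforming stacked arrays y, x, z, of why the
# assertion that follows holds: one-step system GMM with weight matrix
# W = Z'Z/n collapses to 2SLS, which is the estimator IV3SLS computes when
# method="ols". The helper name `one_step_gmm` is hypothetical.
import numpy as np

def one_step_gmm(y, x, z):
    # b = (X'Z W^{-1} Z'X)^{-1} X'Z W^{-1} Z'y; with W = Z'Z/n this is 2SLS.
    n = z.shape[0]
    w = z.T @ z / n
    xpz = x.T @ z
    bread = xpz @ np.linalg.solve(w, xpz.T)
    rhs = xpz @ np.linalg.solve(w, z.T @ y)
    return np.linalg.solve(bread, rhs)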
assert_allclose(gmm.params, tsls.params) def test_3sls_2sls_equiv(): eqns = generate_3sls_data_v2(k=1) tsls_mod = IV3SLS(eqns) - tsls = tsls_mod.fit(method='ols', cov_type='unadjusted', debiased=False) + tsls = tsls_mod.fit(method="ols", cov_type="unadjusted", debiased=False) eqn = eqns[list(eqns.keys())[0]] ivmod = IV2SLS(eqn.dependent, eqn.exog, eqn.endog, eqn.instruments) - iv = ivmod.fit(cov_type='unadjusted', debiased=False) + iv = ivmod.fit(cov_type="unadjusted", debiased=False) assert_allclose(iv.params, tsls.params) assert_allclose(iv.tstats, tsls.tstats) assert_allclose(iv.rsquared, tsls.rsquared) - tsls = tsls_mod.fit(method='ols', cov_type='unadjusted', debiased=True) - iv = ivmod.fit(cov_type='unadjusted', debiased=True) + tsls = tsls_mod.fit(method="ols", cov_type="unadjusted", debiased=True) + iv = ivmod.fit(cov_type="unadjusted", debiased=True) assert_allclose(iv.tstats, tsls.tstats) - tsls = tsls_mod.fit(method='ols', cov_type='robust', debiased=False) - iv = ivmod.fit(cov_type='robust', debiased=False) + tsls = tsls_mod.fit(method="ols", cov_type="robust", debiased=False) + iv = ivmod.fit(cov_type="robust", debiased=False) assert_allclose(iv.tstats, tsls.tstats) diff --git a/linearmodels/tests/system/test_formulas.py b/linearmodels/tests/system/test_formulas.py index 1cc89dce46..b2b4d7b9a3 100644 --- a/linearmodels/tests/system/test_formulas.py +++ b/linearmodels/tests/system/test_formulas.py @@ -16,23 +16,28 @@ joined = [] for i, key in enumerate(data): eq = data[key] - joined.append(Series(eq.dependent[:, 0], name='y{0}'.format(i + 1))) + joined.append(Series(eq.dependent[:, 0], name="y{0}".format(i + 1))) for j, col in enumerate(eq.exog.T): - joined.append(Series(col, name='x{0}{1}'.format(i + 1, j + 1))) + joined.append(Series(col, name="x{0}{1}".format(i + 1, j + 1))) k = len(eq.exog.T) for j, col in enumerate(eq.endog.T): - joined.append(Series(col, name='x{0}{1}'.format(i + 1, j + k + 1))) + joined.append(Series(col, name="x{0}{1}".format(i + 1, j + k + 1))) for j, col in enumerate(eq.instruments.T): - joined.append(Series(col, name='z{0}{1}'.format(i + 1, j + 1))) + joined.append(Series(col, name="z{0}{1}".format(i + 1, j + 1))) joined = concat(joined, 1) fmlas = [ - {'eq1': 'y1 ~ x11 + x12', 'eq2': 'y2 ~ x21 + x22'}, - {'eq1': 'y1 ~ 1 + x11 + x12', 'eq2': 'y2 ~ 1 + x21 + x22'}, - {'eq1': 'y1 ~ 1 + x11 + np.exp(x12)', 'eq2': 'y2 ~ 1 + x21 + sigmoid(x22)'}, - {'eq1': 'y1 ~ 1 + x11 + [x14 + x15 ~ z11 + z12 + z13]', 'eq2': 'y2 ~ 1 + x21 + x22'}, - {'eq1': 'y1 ~ [x14 + x15 ~ 1 + x11 + x12 + x13 + z11 + z12 + z13]', - 'eq2': 'y2 ~ x21 + [x24 ~ 1 + z21 + z22 + z23]'} + {"eq1": "y1 ~ x11 + x12", "eq2": "y2 ~ x21 + x22"}, + {"eq1": "y1 ~ 1 + x11 + x12", "eq2": "y2 ~ 1 + x21 + x22"}, + {"eq1": "y1 ~ 1 + x11 + np.exp(x12)", "eq2": "y2 ~ 1 + x21 + sigmoid(x22)"}, + { + "eq1": "y1 ~ 1 + x11 + [x14 + x15 ~ z11 + z12 + z13]", + "eq2": "y2 ~ 1 + x21 + x22", + }, + { + "eq1": "y1 ~ [x14 + x15 ~ 1 + x11 + x12 + x13 + z11 + z12 + z13]", + "eq2": "y2 ~ x21 + [x24 ~ 1 + z21 + z22 + z23]", + }, ] models = ((SUR, sur), (IVSystemGMM, iv_system_gmm), (IV3SLS, iv_3sls)) @@ -41,8 +46,8 @@ ids = [] for f, m in params: - key = '--'.join([value for value in f.values()]) - key += ' : ' + str(m[0].__name__) + key = "--".join([value for value in f.values()]) + key += " : " + str(m[0].__name__) ids.append(key) @@ -50,7 +55,7 @@ def sigmoid(v): return np.exp(v) / (1 + np.exp(v)) -@pytest.fixture(scope='module', params=params, ids=ids) +@pytest.fixture(scope="module", params=params, ids=ids) def 
config(request): fmla, model_interace = request.param model, interface = model_interace @@ -60,7 +65,7 @@ def config(request): def test_fromula(config): fmla, model, interface = config for key in fmla: - if '[' in fmla[key] and model not in (IVSystemGMM, IV3SLS): + if "[" in fmla[key] and model not in (IVSystemGMM, IV3SLS): return mod = model.from_formula(fmla, joined) mod_fmla = interface(fmla, joined) @@ -72,7 +77,7 @@ def test_fromula(config): def test_predict(config): fmla, model, interface = config for key in fmla: - if '[' in fmla[key] and model not in (IVSystemGMM, IV3SLS): + if "[" in fmla[key] and model not in (IVSystemGMM, IV3SLS): return mod = model.from_formula(fmla, joined) res = mod.fit() @@ -86,7 +91,7 @@ def test_predict(config): def test_predict_partial(config): fmla, model, interface = config for key in fmla: - if '[' in fmla[key] and model not in (IVSystemGMM, IV3SLS): + if "[" in fmla[key] and model not in (IVSystemGMM, IV3SLS): return mod = model.from_formula(fmla, joined) res = mod.fit() @@ -106,7 +111,7 @@ def test_predict_partial(config): for key in list(mod._equations.keys())[1:]: eqns[key] = mod._equations[key] final = list(mod._equations.keys())[0] - eqns[final] = {'exog': None, 'endog': None} + eqns[final] = {"exog": None, "endog": None} pred3 = res.predict(equations=eqns, dataframe=True) assert_frame_equal(pred2[pred3.columns], pred3) @@ -120,7 +125,7 @@ def test_predict_partial(config): def test_invalid_predict(config): fmla, model, interface = config for key in fmla: - if '[' in fmla[key] and model not in (IVSystemGMM, IV3SLS): + if "[" in fmla[key] and model not in (IVSystemGMM, IV3SLS): return mod = model.from_formula(fmla, joined) res = mod.fit() @@ -144,18 +149,18 @@ def test_parser(config): for key in orig_data: eq = orig_data[key] if exog[key] is None: - assert eq['exog'] is None + assert eq["exog"] is None else: - assert_frame_equal(exog[key], eq['exog']) - assert_frame_equal(dep[key], eq['dependent']) + assert_frame_equal(exog[key], eq["exog"]) + assert_frame_equal(dep[key], eq["dependent"]) if endog[key] is None: - assert eq['endog'] is None + assert eq["endog"] is None else: - assert_frame_equal(endog[key], eq['endog']) + assert_frame_equal(endog[key], eq["endog"]) if instr[key] is None: - assert eq['instruments'] is None + assert eq["instruments"] is None else: - assert_frame_equal(instr[key], eq['instruments']) + assert_frame_equal(instr[key], eq["instruments"]) labels = parser.equation_labels for label in labels: diff --git a/linearmodels/tests/system/test_gmm.py b/linearmodels/tests/system/test_gmm.py index 44eb6d2ea0..1907d2b729 100644 --- a/linearmodels/tests/system/test_gmm.py +++ b/linearmodels/tests/system/test_gmm.py @@ -22,37 +22,38 @@ def gen_id(r): - id = 'steps:{0}'.format(r[0]) + id = "steps:{0}".format(r[0]) if r[1]: - id += ',robust' + id += ",robust" else: - id += ',unadjusted' + id += ",unadjusted" return id ids = list(map(gen_id, params)) -@pytest.fixture(scope='module', params=params, ids=ids) +@pytest.fixture(scope="module", params=params, ids=ids) def data(request): steps, robust = request.param - weight_type = 'robust' if robust else 'unadjusted' + weight_type = "robust" if robust else "unadjusted" eqns = generate_3sls_data_v2(k=3) y = [eqns[key].dependent for key in eqns] x = [np.concatenate([eqns[key].exog, eqns[key].endog], 1) for key in eqns] z = [np.concatenate([eqns[key].exog, eqns[key].instruments], 1) for key in eqns] - return AttrDict(eqns=eqns, x=x, y=y, z=z, steps=steps, - robust=robust, weight_type=weight_type) + 
return AttrDict( + eqns=eqns, x=x, y=y, z=z, steps=steps, robust=robust, weight_type=weight_type + ) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def weight_data(): eqns = generate_3sls_data_v2(k=2) mod = IV3SLS(eqns) x = mod._x z = mod._z - res = mod.fit(cov_type='unadjusted') + res = mod.fit(cov_type="unadjusted") eps = res.resids.values sigma = res.sigma return x, z, eps, sigma @@ -95,7 +96,7 @@ def test_cov(data): def test_formula_equivalence(data): - mod = IVSystemGMM(data.eqns, weight_type='unadjusted') + mod = IVSystemGMM(data.eqns, weight_type="unadjusted") formula = [] df = [] for i, key in enumerate(data.eqns): @@ -104,31 +105,37 @@ def test_formula_equivalence(data): ex = eqn.exog en = eqn.endog instr = eqn.instruments - dep = DataFrame(dep, columns=['dep_{0}'.format(i)]) + dep = DataFrame(dep, columns=["dep_{0}".format(i)]) has_const = False if np.any(np.all(ex == 1, 0)): ex = ex[:, 1:] has_const = True - ex = DataFrame(ex, columns=['ex_{0}_{1}'.format(i, j) for j in range(ex.shape[1])]) - en = DataFrame(en, columns=['en_{0}_{1}'.format(i, j) for j in range(en.shape[1])]) - instr = DataFrame(instr, columns=['instr_{0}_{1}'.format(i, j) - for j in range(ex.shape[1])]) - fmla = ''.join(dep.columns) + ' ~ ' + ex = DataFrame( + ex, columns=["ex_{0}_{1}".format(i, j) for j in range(ex.shape[1])] + ) + en = DataFrame( + en, columns=["en_{0}_{1}".format(i, j) for j in range(en.shape[1])] + ) + instr = DataFrame( + instr, columns=["instr_{0}_{1}".format(i, j) for j in range(ex.shape[1])] + ) + fmla = "".join(dep.columns) + " ~ " if has_const: - fmla += ' 1 + ' - fmla += ' + '.join(ex.columns) + ' + [' - fmla += ' + '.join(en.columns) + ' ~ ' - fmla += ' + '.join(instr.columns) + ' ] ' + fmla += " 1 + " + fmla += " + ".join(ex.columns) + " + [" + fmla += " + ".join(en.columns) + " ~ " + fmla += " + ".join(instr.columns) + " ] " formula.append(fmla) df.extend([dep, ex, en, instr]) from collections import OrderedDict + formulas = OrderedDict() for i, f in enumerate(formula): - formulas['eq{0}'.format(i)] = f + formulas["eq{0}".format(i)] = f df = concat(df, 1) - formula_mod = IVSystemGMM.from_formula(formulas, df, weight_type='unadjusted') - res = mod.fit(cov_type='unadjusted') - formula_res = formula_mod.fit(cov_type='unadjusted') + formula_mod = IVSystemGMM.from_formula(formulas, df, weight_type="unadjusted") + res = mod.fit(cov_type="unadjusted") + formula_res = formula_mod.fit(cov_type="unadjusted") assert_allclose(res.params, formula_res.params) @@ -137,13 +144,13 @@ def test_formula_equivalence_weights(data): eqn_copy = AttrDict() for key in data.eqns: eqn = {k: v for k, v in data.eqns[key].items()} - nobs = eqn['dependent'].shape[0] + nobs = eqn["dependent"].shape[0] w = np.random.chisquare(2, (nobs, 1)) / 2 weights[key] = w - eqn['weights'] = w + eqn["weights"] = w eqn_copy[key] = eqn - mod = IVSystemGMM(eqn_copy, weight_type='unadjusted') + mod = IVSystemGMM(eqn_copy, weight_type="unadjusted") df = [] formulas = OrderedDict() for i, key in enumerate(data.eqns): @@ -152,43 +159,52 @@ def test_formula_equivalence_weights(data): ex = eqn.exog en = eqn.endog instr = eqn.instruments - dep = DataFrame(dep, columns=['dep_{0}'.format(i)]) + dep = DataFrame(dep, columns=["dep_{0}".format(i)]) has_const = False if np.any(np.all(ex == 1, 0)): ex = ex[:, 1:] has_const = True - ex = DataFrame(ex, columns=['ex_{0}_{1}'.format(i, j) for j in range(ex.shape[1])]) - en = DataFrame(en, columns=['en_{0}_{1}'.format(i, j) for j in range(en.shape[1])]) - instr = DataFrame(instr, 
columns=['instr_{0}_{1}'.format(i, j) - for j in range(ex.shape[1])]) - fmla = ''.join(dep.columns) + ' ~ ' + ex = DataFrame( + ex, columns=["ex_{0}_{1}".format(i, j) for j in range(ex.shape[1])] + ) + en = DataFrame( + en, columns=["en_{0}_{1}".format(i, j) for j in range(en.shape[1])] + ) + instr = DataFrame( + instr, columns=["instr_{0}_{1}".format(i, j) for j in range(ex.shape[1])] + ) + fmla = "".join(dep.columns) + " ~ " if has_const: - fmla += ' 1 + ' - fmla += ' + '.join(ex.columns) + ' + [' - fmla += ' + '.join(en.columns) + ' ~ ' - fmla += ' + '.join(instr.columns) + ' ] ' + fmla += " 1 + " + fmla += " + ".join(ex.columns) + " + [" + fmla += " + ".join(en.columns) + " ~ " + fmla += " + ".join(instr.columns) + " ] " formulas[key] = fmla df.extend([dep, ex, en, instr]) df = concat(df, 1) - formula_mod = IVSystemGMM.from_formula(formulas, df, weights=weights, weight_type='unadjusted') - res = mod.fit(cov_type='unadjusted') - formula_res = formula_mod.fit(cov_type='unadjusted') + formula_mod = IVSystemGMM.from_formula( + formulas, df, weights=weights, weight_type="unadjusted" + ) + res = mod.fit(cov_type="unadjusted") + formula_res = formula_mod.fit(cov_type="unadjusted") assert_allclose(res.params, formula_res.params) def test_weight_options(data): - mod = IVSystemGMM(data.eqns, weight_type='unadjusted', debiased=True, center=True) - res = mod.fit(cov_type='unadjusted') - assert res.weight_config == {'debiased': True, 'center': True} - assert res.weight_type == 'unadjusted' - assert 'Debiased: True' in str(res.summary) + mod = IVSystemGMM(data.eqns, weight_type="unadjusted", debiased=True, center=True) + res = mod.fit(cov_type="unadjusted") + assert res.weight_config == {"debiased": True, "center": True} + assert res.weight_type == "unadjusted" + assert "Debiased: True" in str(res.summary) assert str(hex(id(res._weight_estimtor))) in res._weight_estimtor.__repr__() - assert res._weight_estimtor.config == {'debiased': True, 'center': True} - base_res = IVSystemGMM(data.eqns, weight_type='unadjusted').fit(cov_type='unadjusted') + assert res._weight_estimtor.config == {"debiased": True, "center": True} + base_res = IVSystemGMM(data.eqns, weight_type="unadjusted").fit( + cov_type="unadjusted" + ) assert np.all(np.diag(res.w) >= np.diag(base_res.w)) - mod = IVSystemGMM(data.eqns, weight_type='robust', debiased=True) - res = mod.fit(cov_type='robust') + mod = IVSystemGMM(data.eqns, weight_type="robust", debiased=True) + res = mod.fit(cov_type="robust") def test_no_constant_smoke(): @@ -199,21 +215,23 @@ def test_no_constant_smoke(): def test_unknown_weight_type(data): with pytest.raises(ValueError): - IVSystemGMM(data.eqns, weight_type='unknown') + IVSystemGMM(data.eqns, weight_type="unknown") def test_unknown_cov_type(data): mod = IVSystemGMM(data.eqns) with pytest.raises(ValueError): - mod.fit(cov_type='unknown') + mod.fit(cov_type="unknown") with pytest.raises(ValueError): mod.fit(cov_type=3) def test_initial_weight_matrix(data): mod = IVSystemGMM(data.eqns) - z = [np.concatenate([data.eqns[key].exog, data.eqns[key].instruments], 1) - for key in data.eqns] + z = [ + np.concatenate([data.eqns[key].exog, data.eqns[key].instruments], 1) + for key in data.eqns + ] z = np.concatenate(z, 1) ze = z + np.random.standard_normal(size=z.shape) w0 = ze.T @ ze / ze.shape[0] @@ -225,30 +243,30 @@ def test_initial_weight_matrix(data): def test_summary(data): mod = IVSystemGMM(data.eqns) res = mod.fit() - assert 'Instruments' in res.summary.as_text() - assert 'Weight Estimator' in 
res.summary.as_text() + assert "Instruments" in res.summary.as_text() + assert "Weight Estimator" in res.summary.as_text() for eq in res.equations: - assert 'Weight Estimator' in res.equations[eq].summary.as_text() - assert 'Instruments' in res.equations[eq].summary.as_text() + assert "Weight Estimator" in res.equations[eq].summary.as_text() + assert "Instruments" in res.equations[eq].summary.as_text() res = mod.fit(iter_limit=10) if res.iterations > 2: - assert 'Iterative System GMM' in res.summary.as_text() + assert "Iterative System GMM" in res.summary.as_text() def test_summary_homoskedastic(data): - mod = IVSystemGMM(data.eqns, weight_type='unadjusted', debiased=True) - res = mod.fit(cov_type='homoskedastic', debiased=True) - assert 'Homoskedastic (Unadjusted) Weighting' in res.summary.as_text() + mod = IVSystemGMM(data.eqns, weight_type="unadjusted", debiased=True) + res = mod.fit(cov_type="homoskedastic", debiased=True) + assert "Homoskedastic (Unadjusted) Weighting" in res.summary.as_text() def test_fixed_sigma(data): - mod = IVSystemGMM(data.eqns, weight_type='unadjusted') - res = mod.fit(cov_type='unadjusted') + mod = IVSystemGMM(data.eqns, weight_type="unadjusted") + res = mod.fit(cov_type="unadjusted") k = len(data.eqns) b = np.random.standard_normal((k, 1)) sigma = b @ b.T + np.diag(np.ones(k)) - mod_sigma = IVSystemGMM(data.eqns, weight_type='unadjusted', sigma=sigma) + mod_sigma = IVSystemGMM(data.eqns, weight_type="unadjusted", sigma=sigma) res_sigma = mod_sigma.fit() assert np.any(res.params != res_sigma.params) assert np.any(res.sigma != res_sigma.sigma) @@ -259,7 +277,7 @@ def test_incorrect_sigma_shape(data): b = np.random.standard_normal((k + 2, 1)) sigma = b @ b.T + np.diag(np.ones(k + 2)) with pytest.raises(ValueError): - IVSystemGMM(data.eqns, weight_type='unadjusted', sigma=sigma) + IVSystemGMM(data.eqns, weight_type="unadjusted", sigma=sigma) def test_invalid_sigma_usage(data): @@ -267,7 +285,7 @@ def test_invalid_sigma_usage(data): b = np.random.standard_normal((k, 1)) sigma = b @ b.T + np.diag(np.ones(k)) with pytest.warns(UserWarning): - IVSystemGMM(data.eqns, weight_type='robust', sigma=sigma) + IVSystemGMM(data.eqns, weight_type="robust", sigma=sigma) def test_j_statistic_direct(data): @@ -290,22 +308,22 @@ def test_linear_constraint(data): def test_kernel_equiv(data): - mod = IVSystemGMM(data.eqns, weight_type='kernel', bandwidth=0) - res = mod.fit(cov_type='kernel', debiased=True, bandwidth=0) - assert 'Kernel (HAC) Weighting' in res.summary.as_text() - rob_mod = IVSystemGMM(data.eqns, weight_type='robust') - rob_res = rob_mod.fit(cov_type='robust', debiased=True) + mod = IVSystemGMM(data.eqns, weight_type="kernel", bandwidth=0) + res = mod.fit(cov_type="kernel", debiased=True, bandwidth=0) + assert "Kernel (HAC) Weighting" in res.summary.as_text() + rob_mod = IVSystemGMM(data.eqns, weight_type="robust") + rob_res = rob_mod.fit(cov_type="robust", debiased=True) assert_allclose(res.tstats, rob_res.tstats) def test_kernel_optimal_bandwidth(data): - mod = IVSystemGMM(data.eqns, weight_type='kernel') - res = mod.fit(cov_type='kernel', debiased=True) + mod = IVSystemGMM(data.eqns, weight_type="kernel") + res = mod.fit(cov_type="kernel", debiased=True) nobs = data.eqns[list(data.eqns.keys())[0]].dependent.shape[0] - assert res.weight_config['bandwidth'] == (nobs - 2) + assert res.weight_config["bandwidth"] == (nobs - 2) - mod = IVSystemGMM(data.eqns, weight_type='kernel', optimal_bw=True) - mod.fit(cov_type='kernel', debiased=True) + mod = IVSystemGMM(data.eqns, 
weight_type="kernel", optimal_bw=True) + mod.fit(cov_type="kernel", debiased=True) def test_homoskedastic_weight_direct(weight_data, center, debias): @@ -333,7 +351,7 @@ def test_heteroskedastic_weight_direct(weight_data, center, debias): x, z, eps, sigma = weight_data weights = wm.weight_matrix(x, z, eps, sigma=sigma) k = len(z) - ze = [z[i] * eps[:, i:i + 1] for i in range(k)] + ze = [z[i] * eps[:, i : i + 1] for i in range(k)] ze = np.concatenate(ze, 1) if center: ze = ze - ze.mean(0) @@ -350,11 +368,11 @@ def test_heteroskedastic_weight_direct(weight_data, center, debias): def test_kernel_weight_direct(weight_data, center, debias): bandwidth = 12 - wm = KernelWeightMatrix(center, debias, kernel='parzen', bandwidth=bandwidth) + wm = KernelWeightMatrix(center, debias, kernel="parzen", bandwidth=bandwidth) x, z, eps, sigma = weight_data weights = wm.weight_matrix(x, z, eps, sigma=sigma) k = len(z) - ze = [z[i] * eps[:, i:i + 1] for i in range(k)] + ze = [z[i] * eps[:, i : i + 1] for i in range(k)] ze = np.concatenate(ze, 1) if center: ze = ze - ze.mean(0) @@ -380,13 +398,16 @@ def test_fitted(data): for i, key in enumerate(res.equations): eq = res.equations[key] fv = res.fitted_values[key].copy() - fv.name = 'fitted_values' + fv.name = "fitted_values" assert_series_equal(eq.fitted_values, fv) b = eq.params.values direct = mod._x[i] @ b expected.append(direct[:, None]) assert_allclose(eq.fitted_values, direct, atol=1e-8) expected = np.concatenate(expected, 1) - expected = DataFrame(expected, index=mod._dependent[i].pandas.index, - columns=[key for key in res.equations]) + expected = DataFrame( + expected, + index=mod._dependent[i].pandas.index, + columns=[key for key in res.equations], + ) assert_frame_equal(expected, res.fitted_values) diff --git a/linearmodels/tests/system/test_sur.py b/linearmodels/tests/system/test_sur.py index d93aad6f97..bbcacdf3a8 100644 --- a/linearmodels/tests/system/test_sur.py +++ b/linearmodels/tests/system/test_sur.py @@ -27,12 +27,12 @@ def gen_id(param): - idstr = 'homo' if isinstance(param[0], list) else 'hetero' - idstr += '-const' if param[1] else '' - idstr += '-correl' if param[2] != 0 else '' - idstr += '-common' if param[3] else '' - idstr += '-weights' if param[4] else '' - idstr += '-dist' if param[4] else '-tuple' + idstr = "homo" if isinstance(param[0], list) else "hetero" + idstr += "-const" if param[1] else "" + idstr += "-correl" if param[2] != 0 else "" + idstr += "-common" if param[3] else "" + idstr += "-weights" if param[4] else "" + idstr += "-dist" if param[4] else "-tuple" return idstr @@ -48,9 +48,9 @@ def check_results(res1, res2): assert_allclose(res1.wresids, res2.wresids) assert_allclose(res1.tstats, res2.tstats) assert_allclose(res1.std_errors, res2.std_errors) - if hasattr(res1, 'rsquared_adj'): + if hasattr(res1, "rsquared_adj"): assert_allclose(res1.rsquared_adj, res2.rsquared_adj) - if hasattr(res1, 'f_statistic'): + if hasattr(res1, "f_statistic"): assert_allclose(res1.f_statistic.stat, res2.f_statistic.stat) if res2.f_statistic.df_denom is None: # Do not test case of F dist due to DOF differences @@ -58,7 +58,7 @@ def check_results(res1, res2): def get_res(res): - d = filter(lambda s: not s.startswith('_'), dir(res)) + d = filter(lambda s: not s.startswith("_"), dir(res)) for attr in d: value = getattr(res, attr) if isinstance(value, Mapping): @@ -71,19 +71,24 @@ def data(request): p, const, rho, common_exog, included_weights, output_dict = request.param if common_exog and isinstance(p, list): p = 3 - return 
generate_data(p=p, const=const, rho=rho, - common_exog=common_exog, included_weights=included_weights, - output_dict=output_dict) + return generate_data( + p=p, + const=const, + rho=rho, + common_exog=common_exog, + included_weights=included_weights, + output_dict=output_dict, + ) -@pytest.fixture(scope='module', params=[0, 0.1]) +@pytest.fixture(scope="module", params=[0, 0.1]) def missing_data(request): eqns = generate_data() np.random.seed(12345) missing = np.random.random_sample(500) missing = missing < request.param for key in eqns: - eqns[key]['dependent'][missing] = np.nan + eqns[key]["dependent"][missing] = np.nan return eqns @@ -91,46 +96,51 @@ def missing_data(request): def gen_id(param): - idstr = 'const' if param[0] else '' - idstr += '-correl' if param[1] != 0 else '' - idstr += '-weights' if param[2] else '' + idstr = "const" if param[0] else "" + idstr += "-correl" if param[1] != 0 else "" + idstr += "-weights" if param[2] else "" return idstr ids = list(map(gen_id, params)) -@pytest.fixture(scope='module', params=params, ids=ids) +@pytest.fixture(scope="module", params=params, ids=ids) def mvreg_data(request): const, rho, included_weights = request.param - values = generate_data(const=const, rho=rho, - common_exog=True, included_weights=included_weights) + values = generate_data( + const=const, rho=rho, common_exog=True, included_weights=included_weights + ) dep = [] for key in values: - exog = values[key]['exog'] - dep.append(values[key]['dependent']) + exog = values[key]["exog"] + dep.append(values[key]["dependent"]) return np.hstack(dep), exog -kernels = ['bartlett', 'newey-west', 'parzen', 'gallant', 'qs', 'andrews'] +kernels = ["bartlett", "newey-west", "parzen", "gallant", "qs", "andrews"] bandwidths = [None, 0, 10] debiased = [True, False] params = list(product(kernels, bandwidths, debiased)) -ids = list(map(lambda p: p[0] + ', BW: ' + str(p[1]) + ', Debiased: ' + str(p[2]), params)) +ids = list( + map(lambda p: p[0] + ", BW: " + str(p[1]) + ", Debiased: " + str(p[2]), params) +) @pytest.fixture(params=params, ids=ids) def kernel_options(request): - return {'kernel': request.param[0], - 'bandwidth': request.param[1], - 'debiased': request.param[2]} + return { + "kernel": request.param[0], + "bandwidth": request.param[1], + "debiased": request.param[2], + } def test_smoke(data): mod = SUR(data) mod.fit() - mod.fit(cov_type='unadjusted') - mod.fit(cov_type='unadjusted', method='ols') + mod.fit(cov_type="unadjusted") + mod.fit(cov_type="unadjusted", method="ols") res = mod.fit(full_cov=False) get_res(res) @@ -140,28 +150,39 @@ def test_errors(): with pytest.raises(TypeError): SUR([]) with pytest.raises(TypeError): - SUR({'a': 'absde', 'b': 12345}) - - moddata = {'a': {'dependent': np.random.standard_normal((100, 1)), - 'exog': np.random.standard_normal((100, 5))}} + SUR({"a": "absde", "b": 12345}) + + moddata = { + "a": { + "dependent": np.random.standard_normal((100, 1)), + "exog": np.random.standard_normal((100, 5)), + } + } with pytest.raises(ValueError): mod = SUR(moddata) - mod.fit(cov_type='unknown') - - moddata = {'a': {'dependent': np.random.standard_normal((100, 1)), - 'exog': np.random.standard_normal((101, 5))}} + mod.fit(cov_type="unknown") + + moddata = { + "a": { + "dependent": np.random.standard_normal((100, 1)), + "exog": np.random.standard_normal((101, 5)), + } + } with pytest.raises(ValueError): SUR(moddata) - moddata = {'a': {'dependent': np.random.standard_normal((10, 1)), - 'exog': np.random.standard_normal((10, 20))}} + moddata = { + "a": { + 
"dependent": np.random.standard_normal((10, 1)), + "exog": np.random.standard_normal((10, 20)), + } + } with pytest.raises(ValueError): SUR(moddata) x = np.random.standard_normal((100, 2)) x = np.c_[x, x] - moddata = {'a': {'dependent': np.random.standard_normal((100, 1)), - 'exog': x}} + moddata = {"a": {"dependent": np.random.standard_normal((100, 1)), "exog": x}} with pytest.raises(ValueError): SUR(moddata) @@ -170,70 +191,72 @@ def test_mv_reg_smoke(mvreg_data): dependent, exog = mvreg_data mod = SUR.multivariate_ls(dependent, exog) mod.fit() - mod.fit(cov_type='unadjusted') - res = mod.fit(cov_type='unadjusted', method='ols') - assert res.method == 'OLS' + mod.fit(cov_type="unadjusted") + res = mod.fit(cov_type="unadjusted", method="ols") + assert res.method == "OLS" res = mod.fit(full_cov=False) get_res(res) def test_formula(): - data = DataFrame(np.random.standard_normal((500, 4)), - columns=['y1', 'y2', 'x1', 'x2']) - formula = {'eq1': 'y1 ~ 1 + x1', 'eq2': 'y2 ~ 1 + x2'} + data = DataFrame( + np.random.standard_normal((500, 4)), columns=["y1", "y2", "x1", "x2"] + ) + formula = {"eq1": "y1 ~ 1 + x1", "eq2": "y2 ~ 1 + x2"} mod = SUR.from_formula(formula, data) mod.fit() - formula = '{y1 ~ 1 + x1} {y2 ~ 1 + x2}' + formula = "{y1 ~ 1 + x1} {y2 ~ 1 + x2}" mod = SUR.from_formula(formula, data) - mod.fit(cov_type='heteroskedastic') + mod.fit(cov_type="heteroskedastic") - formula = ''' + formula = """ {y1 ~ 1 + x1} {y2 ~ 1 + x2} - ''' + """ mod = SUR.from_formula(formula, data) - mod.fit(cov_type='heteroskedastic') + mod.fit(cov_type="heteroskedastic") - formula = ''' + formula = """ {eq.a:y1 ~ 1 + x1} {second: y2 ~ 1 + x2} - ''' + """ mod = SUR.from_formula(formula, data) - res = mod.fit(cov_type='heteroskedastic') - assert 'eq.a' in res.equation_labels - assert 'second' in res.equation_labels + res = mod.fit(cov_type="heteroskedastic") + assert "eq.a" in res.equation_labels + assert "second" in res.equation_labels # TODO: Implement weights # TODO: 1. 
MV OLS and OLS (weighted) homo and hetero # TODO: Implement observation dropping and check + def test_mv_ols_equivalence(mvreg_data): dependent, exog = mvreg_data mod = SUR.multivariate_ls(dependent, exog) - res = mod.fit(cov_type='unadjusted') + res = mod.fit(cov_type="unadjusted") keys = res.equation_labels - assert res.method == 'OLS' + assert res.method == "OLS" for i in range(dependent.shape[1]): ols_mod = OLS(dependent[:, i], exog) - ols_res = ols_mod.fit(cov_type='unadjusted', debiased=False) + ols_res = ols_mod.fit(cov_type="unadjusted", debiased=False) mv_res = res.equations[keys[i]] - assert mv_res.method == 'OLS' + assert mv_res.method == "OLS" check_results(mv_res, ols_res) def test_mv_ols_equivalence_robust(mvreg_data): dependent, exog = mvreg_data mod = SUR.multivariate_ls(dependent, exog) - res = mod.fit(cov_type='robust') + res = mod.fit(cov_type="robust") keys = res.equation_labels for i in range(dependent.shape[1]): ols_mod = OLS(dependent[:, i], exog) - ols_res = ols_mod.fit(cov_type='robust', debiased=False) + ols_res = ols_mod.fit(cov_type="robust", debiased=False) mv_res = res.equations[keys[i]] check_results(mv_res, ols_res) @@ -241,12 +264,12 @@ def test_mv_ols_equivalence_robust(mvreg_data): def test_mv_ols_equivalence_debiased(mvreg_data): dependent, exog = mvreg_data mod = SUR.multivariate_ls(dependent, exog) - res = mod.fit(cov_type='unadjusted', debiased=True) + res = mod.fit(cov_type="unadjusted", debiased=True) keys = res.equation_labels for i in range(dependent.shape[1]): ols_mod = OLS(dependent[:, i], exog) - ols_res = ols_mod.fit(cov_type='unadjusted', debiased=True) + ols_res = ols_mod.fit(cov_type="unadjusted", debiased=True) mv_res = res.equations[keys[i]] check_results(mv_res, ols_res) @@ -254,12 +277,12 @@ def test_mv_ols_equivalence_debiased(mvreg_data): def test_mv_ols_equivalence_hetero_debiased(mvreg_data): dependent, exog = mvreg_data mod = SUR.multivariate_ls(dependent, exog) - res = mod.fit(cov_type='robust', debiased=True) + res = mod.fit(cov_type="robust", debiased=True) keys = res.equation_labels for i in range(dependent.shape[1]): ols_mod = OLS(dependent[:, i], exog) - ols_res = ols_mod.fit(cov_type='robust', debiased=True) + ols_res = ols_mod.fit(cov_type="robust", debiased=True) mv_res = res.equations[keys[i]] check_results(mv_res, ols_res) @@ -272,12 +295,11 @@ def test_gls_eye_mv_ols_equiv(mvreg_data): ad = AttrDict() for i in range(dependent.shape[1]): - key = 'dependent.{0}'.format(i) + key = "dependent.{0}".format(i) df = DataFrame(dependent[:, [i]], columns=[key]) - ad[key] = {'dependent': df, - 'exog': exog.copy()} + ad[key] = {"dependent": df, "exog": exog.copy()} gls_mod = SUR(ad, sigma=np.eye(len(ad))) - gls_res = gls_mod.fit(method='gls') + gls_res = gls_mod.fit(method="gls") check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -285,8 +307,8 @@ def test_gls_eye_mv_ols_equiv(mvreg_data): gls_res_eq = gls_res.equations[keys[i]] check_results(mv_res_eq, gls_res_eq) - mv_res = mv_mod.fit(cov_type='robust') - gls_res = gls_mod.fit(cov_type='robust', method='gls') + mv_res = mv_mod.fit(cov_type="robust") + gls_res = gls_mod.fit(cov_type="robust", method="gls") check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -294,8 +316,8 @@ def test_gls_eye_mv_ols_equiv(mvreg_data): gls_res_eq = gls_res.equations[keys[i]] check_results(mv_res_eq, gls_res_eq) - mv_res = mv_mod.fit(cov_type='robust', debiased=True) - gls_res = gls_mod.fit(cov_type='robust', method='gls', debiased=True) + mv_res = 
mv_mod.fit(cov_type="robust", debiased=True) + gls_res = gls_mod.fit(cov_type="robust", method="gls", debiased=True) check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -312,12 +334,11 @@ def test_gls_without_mv_ols_equiv(mvreg_data): ad = AttrDict() for i in range(dependent.shape[1]): - key = 'dependent.{0}'.format(i) + key = "dependent.{0}".format(i) df = DataFrame(dependent[:, [i]], columns=[key]) - ad[key] = {'dependent': df, - 'exog': exog.copy()} + ad[key] = {"dependent": df, "exog": exog.copy()} gls_mod = SUR(ad) - gls_res = gls_mod.fit(method='ols') + gls_res = gls_mod.fit(method="ols") check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -325,8 +346,8 @@ def test_gls_without_mv_ols_equiv(mvreg_data): gls_res_eq = gls_res.equations[keys[i]] check_results(mv_res_eq, gls_res_eq) - mv_res = mv_mod.fit(cov_type='robust') - gls_res = gls_mod.fit(cov_type='robust', method='ols') + mv_res = mv_mod.fit(cov_type="robust") + gls_res = gls_mod.fit(cov_type="robust", method="ols") check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -334,8 +355,8 @@ def test_gls_without_mv_ols_equiv(mvreg_data): gls_res_eq = gls_res.equations[keys[i]] check_results(mv_res_eq, gls_res_eq) - mv_res = mv_mod.fit(cov_type='robust', debiased=True) - gls_res = gls_mod.fit(cov_type='robust', method='ols', debiased=True) + mv_res = mv_mod.fit(cov_type="robust", debiased=True) + gls_res = gls_mod.fit(cov_type="robust", method="ols", debiased=True) check_results(mv_res, gls_res) for i in range(dependent.shape[1]): @@ -346,18 +367,18 @@ def test_gls_without_mv_ols_equiv(mvreg_data): def test_ols_against_gls(data): mod = SUR(data) - res = mod.fit(method='gls') + res = mod.fit(method="gls") sigma = res.sigma sigma_m12 = inv_matrix_sqrt(sigma) key = list(data.keys())[0] if isinstance(data[key], Mapping): - y = [data[key]['dependent'] for key in data] - x = [data[key]['exog'] for key in data] + y = [data[key]["dependent"] for key in data] + x = [data[key]["exog"] for key in data] try: - w = [data[key]['weights'] for key in data] + w = [data[key]["weights"] for key in data] except KeyError: - w = [np.ones_like(data[key]['dependent']) for key in data] + w = [np.ones_like(data[key]["dependent"]) for key in data] else: y = [data[key][0] for key in data] x = [data[key][1] for key in data] @@ -388,17 +409,17 @@ def test_constraint_setting(data): q = Series([0, 1], index=r.index) mod.add_constraints(r) - mod.fit(method='ols') - res = mod.fit(method='ols', cov_type='unadjusted') + mod.fit(method="ols") + res = mod.fit(method="ols", cov_type="unadjusted") assert_allclose(r.values @ res.params.values[:, None], np.zeros((2, 1)), atol=1e-8) - mod.fit(method='gls') - res = mod.fit(method='gls', cov_type='unadjusted') + mod.fit(method="gls") + res = mod.fit(method="gls", cov_type="unadjusted") assert_allclose(r.values @ res.params.values[:, None], np.zeros((2, 1)), atol=1e-8) mod.add_constraints(r, q) - res = mod.fit(method='ols') + res = mod.fit(method="ols") assert_allclose(r.values @ res.params.values[:, None], q.values[:, None], atol=1e-8) - res = mod.fit(method='gls') + res = mod.fit(method="gls") assert_allclose(r.values @ res.params.values[:, None], q.values[:, None], atol=1e-8) @@ -457,44 +478,47 @@ def test_missing(data): primes = [11, 13, 17, 19, 23] for i, key in enumerate(data): if isinstance(data[key], Mapping): - data[key]['dependent'][::primes[i % 5]] = np.nan + data[key]["dependent"][:: primes[i % 5]] = np.nan else: - data[key][0][::primes[i % 5]] = np.nan + 
data[key][0][:: primes[i % 5]] = np.nan with warnings.catch_warnings(record=True) as w: SUR(data) assert len(w) == 1 - assert 'missing' in w[0].message.args[0] + assert "missing" in w[0].message.args[0] def test_formula_errors(): - data = DataFrame(np.random.standard_normal((500, 4)), - columns=['y1', 'y2', 'x1', 'x2']) + data = DataFrame( + np.random.standard_normal((500, 4)), columns=["y1", "y2", "x1", "x2"] + ) with pytest.raises(TypeError): SUR.from_formula(np.ones(10), data) def test_formula_repeated_key(): - data = DataFrame(np.random.standard_normal((500, 4)), - columns=['y1', 'y2', 'x1', 'x2']) + data = DataFrame( + np.random.standard_normal((500, 4)), columns=["y1", "y2", "x1", "x2"] + ) - formula = ''' + formula = """ {first:y1 ~ 1 + x1} {first: y2 ~ 1 + x2} - ''' + """ mod = SUR.from_formula(formula, data) res = mod.fit() - assert 'first' in res.equation_labels - assert 'first.0' in res.equation_labels + assert "first" in res.equation_labels + assert "first.0" in res.equation_labels def test_formula_weights(): - data = DataFrame(np.random.standard_normal((500, 4)), - columns=['y1', 'y2', 'x1', 'x2']) - weights = DataFrame(np.random.chisquare(5, (500, 2)), columns=['eq1', 'eq2']) + data = DataFrame( + np.random.standard_normal((500, 4)), columns=["y1", "y2", "x1", "x2"] + ) + weights = DataFrame(np.random.chisquare(5, (500, 2)), columns=["eq1", "eq2"]) formula = OrderedDict() - formula['eq1'] = 'y1 ~ 1 + x1' - formula['eq2'] = 'y2 ~ 1 + x1' + formula["eq1"] = "y1 ~ 1 + x1" + formula["eq2"] = "y2 ~ 1 + x1" mod = SUR.from_formula(formula, data, weights=weights) mod.fit() expected = weights.values[:, [0]] @@ -502,8 +526,8 @@ def test_formula_weights(): expected = weights.values[:, [1]] assert_allclose(mod._w[1], expected / expected.mean()) - formula = '{y1 ~ 1 + x1} {y2 ~ 1 + x2}' - weights = DataFrame(np.random.chisquare(5, (500, 2)), columns=['y1', 'y2']) + formula = "{y1 ~ 1 + x1} {y2 ~ 1 + x2}" + weights = DataFrame(np.random.chisquare(5, (500, 2)), columns=["y1", "y2"]) mod = SUR.from_formula(formula, data, weights=weights) mod.fit() expected = weights.values[:, [0]] @@ -513,31 +537,32 @@ def test_formula_weights(): def test_formula_partial_weights(): - data = DataFrame(np.random.standard_normal((500, 4)), - columns=['y1', 'y2', 'x1', 'x2']) - weights = DataFrame(np.random.chisquare(5, (500, 1)), columns=['eq2']) + data = DataFrame( + np.random.standard_normal((500, 4)), columns=["y1", "y2", "x1", "x2"] + ) + weights = DataFrame(np.random.chisquare(5, (500, 1)), columns=["eq2"]) formula = OrderedDict() - formula['eq1'] = 'y1 ~ 1 + x1' - formula['eq2'] = 'y2 ~ 1 + x1' + formula["eq1"] = "y1 ~ 1 + x1" + formula["eq2"] = "y2 ~ 1 + x1" with warnings.catch_warnings(record=True) as w: mod = SUR.from_formula(formula, data, weights=weights) assert len(w) == 1 - assert 'Weights' in w[0].message.args[0] - assert 'eq1' in w[0].message.args[0] - assert 'eq2' not in w[0].message.args[0] + assert "Weights" in w[0].message.args[0] + assert "eq1" in w[0].message.args[0] + assert "eq2" not in w[0].message.args[0] mod.fit() expected = np.ones((500, 1)) assert_allclose(mod._w[0], expected / expected.mean()) expected = weights.values[:, [0]] assert_allclose(mod._w[1], expected / expected.mean()) - formula = '{y1 ~ 1 + x1} {y2 ~ 1 + x2}' - weights = DataFrame(np.random.chisquare(5, (500, 1)), columns=['y2']) + formula = "{y1 ~ 1 + x1} {y2 ~ 1 + x2}" + weights = DataFrame(np.random.chisquare(5, (500, 1)), columns=["y2"]) with warnings.catch_warnings(record=True) as w: mod = SUR.from_formula(formula, 
data, weights=weights) assert len(w) == 1 - assert 'y1' in w[0].message.args[0] - assert 'y2' not in w[0].message.args[0] + assert "y1" in w[0].message.args[0] + assert "y2" not in w[0].message.args[0] expected = np.ones((500, 1)) assert_allclose(mod._w[0], expected / expected.mean()) @@ -555,22 +580,22 @@ def test_against_direct_model(data): keys = list(data.keys()) if not isinstance(data[keys[0]], Mapping): return - if 'weights' in data[keys[0]]: + if "weights" in data[keys[0]]: return y = [] x = [] data_copy = OrderedDict() for i in range(min(3, len(data))): data_copy[keys[i]] = data[keys[i]] - y.append(data[keys[i]]['dependent']) - x.append(data[keys[i]]['exog']) + y.append(data[keys[i]]["dependent"]) + x.append(data[keys[i]]["exog"]) direct = simple_sur(y, x) mod = SUR(data_copy) - res = mod.fit(method='ols') + res = mod.fit(method="ols") assert_allclose(res.params.values[:, None], direct.beta0) - res = mod.fit(method='gls') + res = mod.fit(method="gls") assert_allclose(res.params.values[:, None], direct.beta1) @@ -590,33 +615,45 @@ def test_model_repr(data): repr = mod.__repr__() assert str(len(data)) in repr assert str(hex(id(mod))) in repr - assert 'Seemingly Unrelated Regression (SUR)' in repr + assert "Seemingly Unrelated Regression (SUR)" in repr def test_mv_ols_hac_smoke(kernel_options): - data = generate_data(p=3, const=True, rho=0.8, common_exog=False, - included_weights=False, output_dict=True) + data = generate_data( + p=3, + const=True, + rho=0.8, + common_exog=False, + included_weights=False, + output_dict=True, + ) mod = SUR(data) - res = mod.fit(cov_type='kernel', **kernel_options) - assert 'Kernel (HAC) ' in str(res) - assert 'Kernel: {0}'.format(kernel_options['kernel']) in str(res) - if kernel_options['bandwidth'] == 0: - res_base = mod.fit(cov_type='robust', debiased=kernel_options['debiased']) + res = mod.fit(cov_type="kernel", **kernel_options) + assert "Kernel (HAC) " in str(res) + assert "Kernel: {0}".format(kernel_options["kernel"]) in str(res) + if kernel_options["bandwidth"] == 0: + res_base = mod.fit(cov_type="robust", debiased=kernel_options["debiased"]) assert_allclose(res.tstats, res_base.tstats) def test_invalid_kernel_options(kernel_options): - data = generate_data(p=3, const=True, rho=0.8, common_exog=False, - included_weights=False, output_dict=True) + data = generate_data( + p=3, + const=True, + rho=0.8, + common_exog=False, + included_weights=False, + output_dict=True, + ) mod = SUR(data) with pytest.raises(TypeError): ko = {k: v for k, v in kernel_options.items()} - ko['bandwidth'] = 'None' - mod.fit(cov_type='kernel', **ko) + ko["bandwidth"] = "None" + mod.fit(cov_type="kernel", **ko) with pytest.raises(TypeError): ko = {k: v for k, v in kernel_options.items()} - ko['kernel'] = 1 - mod.fit(cov_type='kernel', **ko) + ko["kernel"] = 1 + mod.fit(cov_type="kernel", **ko) def test_fitted(data): @@ -626,29 +663,35 @@ def test_fitted(data): for i, key in enumerate(res.equations): eq = res.equations[key] fv = res.fitted_values[key].copy() - fv.name = 'fitted_values' + fv.name = "fitted_values" assert_series_equal(eq.fitted_values, fv) b = eq.params.values direct = mod._x[i] @ b expected.append(direct[:, None]) assert_allclose(eq.fitted_values, direct, atol=1e-8) expected = np.concatenate(expected, 1) - expected = DataFrame(expected, index=mod._dependent[i].pandas.index, - columns=[key for key in res.equations]) + expected = DataFrame( + expected, + index=mod._dependent[i].pandas.index, + columns=[key for key in res.equations], + ) 
assert_frame_equal(expected, res.fitted_values) -@pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +@pytest.mark.filterwarnings("ignore::linearmodels.utility.MissingValueWarning") def test_predict(missing_data): mod = SUR(missing_data) res = mod.fit() pred = res.predict() for key in pred: - assert_series_equal(pred[key].iloc[:, 0], res.equations[key].fitted_values, - check_names=False) + assert_series_equal( + pred[key].iloc[:, 0], res.equations[key].fitted_values, check_names=False + ) pred = res.predict(fitted=False, idiosyncratic=True) for key in pred: - assert_series_equal(pred[key].iloc[:, 0], res.equations[key].resids, check_names=False) + assert_series_equal( + pred[key].iloc[:, 0], res.equations[key].resids, check_names=False + ) pred = res.predict(fitted=True, idiosyncratic=True) assert isinstance(pred, dict) for key in res.equations: @@ -662,12 +705,12 @@ def test_predict(missing_data): assert_frame_equal(pred, res.resids) pred = res.predict(fitted=True, idiosyncratic=True, dataframe=True) assert isinstance(pred, dict) - assert 'fitted_values' in pred - assert_frame_equal(pred['fitted_values'], res.fitted_values) - assert 'idiosyncratic' in pred - assert_frame_equal(pred['idiosyncratic'], res.resids) + assert "fitted_values" in pred + assert_frame_equal(pred["fitted_values"], res.fitted_values) + assert "idiosyncratic" in pred + assert_frame_equal(pred["idiosyncratic"], res.resids) - nobs = missing_data[list(missing_data.keys())[0]]['dependent'].shape[0] + nobs = missing_data[list(missing_data.keys())[0]]["dependent"].shape[0] pred = res.predict(fitted=True, idiosyncratic=False, dataframe=True, missing=True) assert pred.shape[0] == nobs @@ -676,9 +719,55 @@ def test_predict(missing_data): assert pred[key].shape[0] == nobs -@pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +@pytest.mark.filterwarnings("ignore::linearmodels.utility.MissingValueWarning") def test_predict_error(missing_data): mod = SUR(missing_data) res = mod.fit() with pytest.raises(ValueError): res.predict(fitted=False, idiosyncratic=False) + + +def reference_mcelroy(u, y, sigma): + u = np.asarray(u) + nobs = u.shape[0] + sigma = np.asarray(sigma) + y = np.asarray(y) + u = u.T.ravel() + y = y.T.ravel() + sigma_inv = np.linalg.inv(sigma) + omega_inv = np.kron(sigma_inv, np.eye(nobs)) + num = u @ omega_inv @ u + iota = np.ones((nobs, 1)) + core = np.kron(sigma_inv, np.eye(nobs) - iota @ iota.T / nobs) + denom = y @ core @ y + + return 1 - num / denom + + +def reference_berndt(u, y): + u = np.asarray(u) + nobs = u.shape[0] + num = np.linalg.det(u.T @ u / nobs) + y = np.asarray(y) + mu = y.mean(0) + y = y - mu + denom = np.linalg.det(y.T @ y / nobs) + return 1 - num / denom + + +def test_system_r2_direct(): + eqns = generate_data(k=3) + mod = SUR(eqns) + res = mod.fit(method="ols", cov_type="unadjusted") + y = np.hstack([eqns[eq]["dependent"] for eq in eqns]) + ref = reference_mcelroy(res.resids, y, res.sigma) + assert_allclose(ref, res.system_rsquared.mcelroy) + ref = reference_berndt(res.resids, y) + assert_allclose(ref, res.system_rsquared.berndt) + + res = mod.fit(method="gls", cov_type="unadjusted", iter_limit=100) + y = np.hstack([eqns[eq]["dependent"] for eq in eqns]) + ref = reference_mcelroy(res.resids, y, res.sigma) + assert_allclose(ref, res.system_rsquared.mcelroy) + ref = reference_berndt(res.resids, y) + assert_allclose(ref, res.system_rsquared.berndt, atol=1e-3, rtol=1e-3) diff --git a/linearmodels/tests/system/test_sur_against_stata.py 
b/linearmodels/tests/system/test_sur_against_stata.py index 8bf1599232..a43633d86d 100644 --- a/linearmodels/tests/system/test_sur_against_stata.py +++ b/linearmodels/tests/system/test_sur_against_stata.py @@ -12,42 +12,44 @@ from linearmodels.tests.system.results.parse_stata_results import stata_results from linearmodels.utility import AttrDict -pytestmark = pytest.mark.filterwarnings('ignore::linearmodels.utility.MissingValueWarning') +pytestmark = pytest.mark.filterwarnings( + "ignore::linearmodels.utility.MissingValueWarning" +) -@pytest.fixture(scope='module', params=list(stata_results.keys())) +@pytest.fixture(scope="module", params=list(stata_results.keys())) def model_data(request): key = request.param - dgp, model_type = key.split('-') - if dgp == 'basic': + dgp, model_type = key.split("-") + if dgp == "basic": data = basic_data - elif dgp == 'common': + elif dgp == "common": data = common_data for i, data_key in enumerate(data): if i == 0: - exog = data[data_key]['exog'] + exog = data[data_key]["exog"] else: - data[data_key]['exog'] = exog + data[data_key]["exog"] = exog else: # dgp == 'missing' data = missing_data - cov_kwds = {'cov_type': 'unadjusted'} - if model_type == 'ss': - cov_kwds['debiased'] = True + cov_kwds = {"cov_type": "unadjusted"} + if model_type == "ss": + cov_kwds["debiased"] = True stata_result = stata_results[key] rekeyed_data = OrderedDict() for data_key in data: temp = data[data_key] - new_key = temp['dependent'].columns[0] + new_key = temp["dependent"].columns[0] rekeyed_data[new_key] = temp constraint = None - if model_type == 'constrained': + if model_type == "constrained": cols = [] widths = [] for new_key in rekeyed_data: - exog = rekeyed_data[new_key]['exog'] - cols.extend([new_key + '_' + col for col in exog.columns]) + exog = rekeyed_data[new_key]["exog"] + cols.extend([new_key + "_" + col for col in exog.columns]) widths.append(exog.shape[1]) - r = pd.DataFrame(columns=cols, index=['r0', 'r1'], dtype=np.float64) + r = pd.DataFrame(columns=cols, index=["r0", "r1"], dtype=np.float64) r.iloc[:, :] = 0.0 r.iloc[:, 0] = -1.0 r.iloc[0, widths[0]] = 1.0 @@ -59,9 +61,16 @@ def model_data(request): mod.add_constraints(constraint) res = mod.fit(**cov_kwds) - return AttrDict(data=rekeyed_data, cov_kwds=cov_kwds, model_type=model_type, - stata_result=stata_result, key=key, constraint=constraint, - mod=mod, res=res) + return AttrDict( + data=rekeyed_data, + cov_kwds=cov_kwds, + model_type=model_type, + stata_result=stata_result, + key=key, + constraint=constraint, + mod=mod, + res=res, + ) def test_params(model_data): @@ -105,12 +114,12 @@ def test_f_stat(model_data): for i, key in enumerate(res.equations): eq = res.equations[key] stat = eq.f_statistic.stat - stata_stat = stata_stats.loc['F_{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["F_{0}".format(i + 1)].squeeze() if np.isnan(stata_stat): - stata_stat = stata_stats.loc['chi2_{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["chi2_{0}".format(i + 1)].squeeze() assert_allclose(stat, stata_stat) pval = eq.f_statistic.pval - stata_pval = stata_stats.loc['p_{0}'.format(i + 1)] + stata_pval = stata_stats.loc["p_{0}".format(i + 1)] assert_allclose(pval, stata_pval, atol=1e-6) @@ -120,7 +129,7 @@ def test_r2(model_data): for i, key in enumerate(res.equations): eq = res.equations[key] stat = eq.rsquared - stata_stat = stata_stats.loc['r2_{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["r2_{0}".format(i + 1)].squeeze() assert_allclose(stat, stata_stat) @@ -130,9 +139,9 @@ def 
test_sum_of_squares(model_data): for i, key in enumerate(res.equations): eq = res.equations[key] stat = eq.resid_ss - stata_stat = stata_stats.loc['rss_{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["rss_{0}".format(i + 1)].squeeze() assert_allclose(stat, stata_stat) - stata_stat = stata_stats.loc['mss_{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["mss_{0}".format(i + 1)].squeeze() stat = eq.model_ss assert_allclose(stat, stata_stat) @@ -143,5 +152,5 @@ def test_df_model(model_data): for i, key in enumerate(res.equations): eq = res.equations[key] stat = eq.df_model - stata_stat = stata_stats.loc['df_m{0}'.format(i + 1)].squeeze() + stata_stat = stata_stats.loc["df_m{0}".format(i + 1)].squeeze() assert_allclose(stat, stata_stat + 1) diff --git a/linearmodels/tests/system/test_utility.py b/linearmodels/tests/system/test_utility.py index 4a5415ab2c..b064fe8c00 100644 --- a/linearmodels/tests/system/test_utility.py +++ b/linearmodels/tests/system/test_utility.py @@ -13,7 +13,7 @@ blocked_inner_prod, inv_matrix_sqrt) -@pytest.fixture(params=(3, np.arange(1, 6)), ids=['common-size', 'different-size']) +@pytest.fixture(params=(3, np.arange(1, 6)), ids=["common-size", "different-size"]) def data(request): k = 5 t = 200 @@ -151,7 +151,7 @@ def test_linear_constraint_repr(): r = np.eye(10) lc = LinearConstraint(r, require_pandas=False) assert hex(id(lc)) in lc.__repr__() - assert '10 constraints' in lc.__repr__() + assert "10 constraints" in lc.__repr__() assert isinstance(lc.q, pd.Series) assert np.all(lc.q == 0) assert lc.q.shape == (10,) @@ -198,4 +198,5 @@ def test_blocked_outer_product(): desired = _x.T @ np.kron(s, np.eye(nobs)) @ _z assert_allclose(actual, desired) + # TODO: One complex constrain test of equivalence diff --git a/linearmodels/tests/test_compat.py b/linearmodels/tests/test_compat.py index 5127bfb3d5..8ec737081b 100644 --- a/linearmodels/tests/test_compat.py +++ b/linearmodels/tests/test_compat.py @@ -7,13 +7,13 @@ from linearmodels.utility import AttrDict -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): - idx = date_range('2000-01-01', periods=100) - df1 = DataFrame(np.arange(100)[:, None], columns=['A'], index=idx) + idx = date_range("2000-01-01", periods=100) + df1 = DataFrame(np.arange(100)[:, None], columns=["A"], index=idx) x = np.reshape(np.arange(200), (100, 2)) - df2 = DataFrame(x, columns=['B', 'C'], index=idx[::-1]) - s = Series(300 + np.arange(100), index=idx, name='D') + df2 = DataFrame(x, columns=["B", "C"], index=idx[::-1]) + s = Series(300 + np.arange(100), index=idx, name="D") return AttrDict(df1=df1, df2=df2, s=s) @@ -22,7 +22,7 @@ def test_concat_sort(data): b = concat([data.df1, data.df2, data.s], 1) c = concat([data.df1, data.df2, data.s], 1, sort=True) d = concat([data.df2, data.df1, data.s], 1, sort=False) - assert list(a.columns) == ['A', 'B', 'C'] - assert list(b.columns) == ['A', 'B', 'C', 'D'] - assert list(c.columns) == ['A', 'B', 'C', 'D'] - assert list(d.columns) == ['B', 'C', 'A', 'D'] + assert list(a.columns) == ["A", "B", "C"] + assert list(b.columns) == ["A", "B", "C", "D"] + assert list(c.columns) == ["A", "B", "C", "D"] + assert list(d.columns) == ["B", "C", "A", "D"] diff --git a/linearmodels/tests/test_examples.py b/linearmodels/tests/test_examples.py index 435cedcd2a..3d95616b72 100644 --- a/linearmodels/tests/test_examples.py +++ b/linearmodels/tests/test_examples.py @@ -15,20 +15,21 @@ import jupyter_client import nbformat from nbconvert.preprocessors import ExecutePreprocessor + 
kernels = jupyter_client.kernelspec.find_kernel_specs()
 except ImportError:  # pragma: no cover
-    pytest.mark.skip(reason='Required packages not available')
+    pytest.mark.skip(reason="Required packages not available")
 
-kernel_name = 'python%s' % sys.version_info.major
+kernel_name = "python%s" % sys.version_info.major
 
 head, _ = os.path.split(__file__)
-NOTEBOOKS_USING_XARRAY = ['panel_data-formats.ipynb']
-NOTEBOOK_DIR = os.path.abspath(os.path.join(head, '..', '..', 'examples'))
+NOTEBOOKS_USING_XARRAY = ["panel_data-formats.ipynb"]
+NOTEBOOK_DIR = os.path.abspath(os.path.join(head, "..", "..", "examples"))
 
-nbs = sorted(glob.glob(os.path.join(NOTEBOOK_DIR, '*.ipynb')))
-ids = list(map(lambda s: os.path.split(s)[-1].split('.')[0], nbs))
+nbs = sorted(glob.glob(os.path.join(NOTEBOOK_DIR, "*.ipynb")))
+ids = list(map(lambda s: os.path.split(s)[-1].split(".")[0], nbs))
 if not nbs:  # pragma: no cover
-    pytest.mark.skip(reason='No notebooks found so not tests run')
+    pytest.mark.skip(reason="No notebooks found so no tests run")
 
 
 @pytest.fixture(params=nbs, ids=ids)
@@ -40,10 +41,8 @@ def notebook(request):
 def test_notebook(notebook):
     nb_name = os.path.split(notebook)[-1]
     if MISSING_XARRAY and nb_name in NOTEBOOKS_USING_XARRAY:
-        pytest.skip('xarray is required to test {0}'.format(notebook))
+        pytest.skip("xarray is required to test {0}".format(notebook))
 
     nb = nbformat.read(notebook, as_version=4)
-    ep = ExecutePreprocessor(allow_errors=False,
-                             timeout=120,
-                             kernel_name=kernel_name)
-    ep.preprocess(nb, {'metadata': {'path': NOTEBOOK_DIR}})
+    ep = ExecutePreprocessor(allow_errors=False, timeout=120, kernel_name=kernel_name)
+    ep.preprocess(nb, {"metadata": {"path": NOTEBOOK_DIR}})
diff --git a/linearmodels/tests/test_utility.py b/linearmodels/tests/test_utility.py
index 64659fdd8e..b14fd3e479 100644
--- a/linearmodels/tests/test_utility.py
+++ b/linearmodels/tests/test_utility.py
@@ -15,7 +15,7 @@
                                  has_constant, inv_sqrth, missing_warning,
                                  panel_to_frame)
 
-MISSING_PANEL = 'Panel' not in dir(pd)
+MISSING_PANEL = "Panel" not in dir(pd)
 
 
 def test_missing_warning():
@@ -60,38 +60,38 @@ def test_hasconstant():
 def test_wald_statistic():
     ts = WaldTestStatistic(1.0, "_NULL_", 1, name="_NAME_")
     assert str(hex(id(ts))) in ts.__repr__()
-    assert '_NULL_' in str(ts)
+    assert "_NULL_" in str(ts)
     assert ts.stat == 1.0
     assert ts.df == 1
     assert ts.df_denom is None
-    assert ts.dist_name == 'chi2(1)'
+    assert ts.dist_name == "chi2(1)"
     assert isinstance(ts.critical_values, dict)
     assert_allclose(1 - stats.chi2.cdf(1.0, 1), ts.pval)
 
     ts = WaldTestStatistic(1.0, "_NULL_", 1, 1000, name="_NAME_")
     assert ts.df == 1
     assert ts.df_denom == 1000
-    assert ts.dist_name == 'F(1,1000)'
+    assert ts.dist_name == "F(1,1000)"
     assert_allclose(1 - stats.f.cdf(1.0, 1, 1000), ts.pval)
 
 
 def test_invalid_test_statistic():
-    ts = InvalidTestStatistic('_REASON_', name='_NAME_')
+    ts = InvalidTestStatistic("_REASON_", name="_NAME_")
     assert str(hex(id(ts))) in ts.__repr__()
-    assert '_REASON_' in str(ts)
+    assert "_REASON_" in str(ts)
     assert np.isnan(ts.pval)
     assert ts.critical_values is None
 
 
 def test_inapplicable_test_statistic():
-    ts = InapplicableTestStatistic(reason='_REASON_', name='_NAME_')
+    ts = InapplicableTestStatistic(reason="_REASON_", name="_NAME_")
     assert str(hex(id(ts))) in ts.__repr__()
-    assert '_REASON_' in str(ts)
+    assert "_REASON_" in str(ts)
     assert np.isnan(ts.pval)
     assert ts.critical_values is None
 
     ts = InapplicableTestStatistic()
-    assert 'not applicable' in str(ts)
+    assert "not applicable" in str(ts)
 
 
 def
test_inv_sqrth(): @@ -103,52 +103,52 @@ def test_inv_sqrth(): def test_ensure_unique_column(): - df = pd.DataFrame({'a': [0, 1, 0], 'b': [1.0, 0.0, 1.0]}) - out = ensure_unique_column('a', df) - assert out == '_a_' - out = ensure_unique_column('c', df) - assert out == 'c' - out = ensure_unique_column('a', df, '=') - assert out == '=a=' - df['_a_'] = -1 - out = ensure_unique_column('a', df) - assert out == '__a__' + df = pd.DataFrame({"a": [0, 1, 0], "b": [1.0, 0.0, 1.0]}) + out = ensure_unique_column("a", df) + assert out == "_a_" + out = ensure_unique_column("c", df) + assert out == "c" + out = ensure_unique_column("a", df, "=") + assert out == "=a=" + df["_a_"] = -1 + out = ensure_unique_column("a", df) + assert out == "__a__" def test_attr_dict(): ad = AttrDict() - ad['one'] = 'one' + ad["one"] = "one" ad[1] = 1 - ad[('a', 2)] = ('a', 2) - assert list(ad.keys()) == ['one', 1, ('a', 2)] + ad[("a", 2)] = ("a", 2) + assert list(ad.keys()) == ["one", 1, ("a", 2)] assert len(ad) == 3 ad2 = ad.copy() assert list(ad2.keys()) == list(ad.keys()) - assert ad.get('one', None) == 'one' - assert ad.get('two', False) is False + assert ad.get("one", None) == "one" + assert ad.get("two", False) is False k, v = ad.popitem() - assert k == 'one' - assert v == 'one' + assert k == "one" + assert v == "one" items = ad.items() assert (1, 1) in items - assert (('a', 2), ('a', 2)) in items + assert (("a", 2), ("a", 2)) in items assert len(items) == 2 values = ad.values() assert 1 in values - assert ('a', 2) in values + assert ("a", 2) in values assert len(values) == 2 ad2 = AttrDict() ad2[1] = 3 - ad2['one'] = 'one' - ad2['a'] = 'a' + ad2["one"] = "one" + ad2["a"] = "a" ad.update(ad2) assert ad[1] == 3 - assert 'a' in ad + assert "a" in ad ad.__str__() with pytest.raises(AttributeError): @@ -156,23 +156,23 @@ def test_attr_dict(): with pytest.raises(AttributeError): ad.some_other_key with pytest.raises(KeyError): - ad['__ordered_dict__'] = None + ad["__ordered_dict__"] = None del ad[1] assert 1 not in ad.keys() - ad.new_value = 'new_value' - assert 'new_value' in ad.keys() - assert ad.new_value == ad['new_value'] + ad.new_value = "new_value" + assert "new_value" in ad.keys() + assert ad.new_value == ad["new_value"] for key in ad.keys(): if isinstance(key, str): assert key in dir(ad) - new_value = ad.pop('new_value') - assert new_value == 'new_value' + new_value = ad.pop("new_value") + assert new_value == "new_value" del ad.one - assert 'one' not in ad.keys() + assert "one" not in ad.keys() ad.clear() assert list(ad.keys()) == [] @@ -202,15 +202,22 @@ def test_panel_to_midf(): df2 = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)), True) pd.testing.assert_frame_equal(df2, expected2) - entities = list(map(''.join, [[random.choice(string.ascii_lowercase) for __ in range(10)] - for _ in range(100)])) - times = pd.date_range('1999-12-31', freq='A-DEC', periods=7) - var_names = ['x.{0}'.format(i) for i in range(1, 4)] + entities = list( + map( + "".join, + [ + [random.choice(string.ascii_lowercase) for __ in range(10)] + for _ in range(100) + ], + ) + ) + times = pd.date_range("1999-12-31", freq="A-DEC", periods=7) + var_names = ["x.{0}".format(i) for i in range(1, 4)] df3 = panel_to_frame(x, var_names, times, entities, True) mi = pd.MultiIndex.from_product([times, entities]) expected3 = pd.DataFrame(index=mi, columns=var_names) for i in range(1, 4): - expected3['x.{0}'.format(i)] = x[i-1].ravel() + expected3["x.{0}".format(i)] = x[i - 1].ravel() expected3.index = expected3.index.swaplevel(0, 1) mi 
= pd.MultiIndex.from_product([entities, times]) expected3 = expected3.loc[mi] diff --git a/linearmodels/utility.py b/linearmodels/utility.py index ac380bd145..cde1883ca6 100644 --- a/linearmodels/utility.py +++ b/linearmodels/utility.py @@ -81,13 +81,13 @@ def __len__(self): def __repr__(self): out = self.__ordered_dict__.__str__() - return 'Attr' + out[7:] + return "Attr" + out[7:] def __str__(self): return self.__repr__() def __init__(self, *args, **kwargs): - self.__dict__['__ordered_dict__'] = OrderedDict(*args, **kwargs) + self.__dict__["__ordered_dict__"] = OrderedDict(*args, **kwargs) def __contains__(self, item): return self.__ordered_dict__.__contains__(item) @@ -96,8 +96,8 @@ def __getitem__(self, item): return self.__ordered_dict__[item] def __setitem__(self, key, value): - if key == '__ordered_dict__': - raise KeyError(key + ' is reserved and cannot be set.') + if key == "__ordered_dict__": + raise KeyError(key + " is reserved and cannot be set.") self.__ordered_dict__[key] = value def __delitem__(self, key): @@ -109,8 +109,8 @@ def __getattr__(self, item): return self.__ordered_dict__[item] def __setattr__(self, key, value): - if key == '__ordered_dict__': - raise AttributeError(key + ' is invalid') + if key == "__ordered_dict__": + raise AttributeError(key + " is invalid") self.__ordered_dict__[key] = value def __delattr__(self, name): @@ -177,7 +177,7 @@ def inv_sqrth(x): Returns ------- - invsqrt : ndarray + ndarray Input to the power -1/2 """ vals, vecs = np.linalg.eigh(x) @@ -215,10 +215,10 @@ def __init__(self, stat, null, df, df_denom=None, name=None): self._name = name if df_denom is None: self.dist = chi2(df) - self.dist_name = 'chi2({0})'.format(df) + self.dist_name = "chi2({0})".format(df) else: self.dist = f(df, df_denom) - self.dist_name = 'F({0},{1})'.format(df, df_denom) + self.dist_name = "F({0},{1})".format(df, df_denom) @property def stat(self): @@ -233,8 +233,7 @@ def pval(self): @property def critical_values(self): """Critical values test for common test sizes""" - return OrderedDict(zip(['10%', '5%', '1%'], - self.dist.ppf([.9, .95, .99]))) + return OrderedDict(zip(["10%", "5%", "1%"], self.dist.ppf([0.9, 0.95, 0.99]))) @property def null(self): @@ -242,16 +241,26 @@ def null(self): return self._null def __str__(self): - name = '' if not self._name else self._name + '\n' - msg = '{name}H0: {null}\nStatistic: {stat:0.4f}\n' \ - 'P-value: {pval:0.4f}\nDistributed: {dist}' - return msg.format(name=name, null=self.null, stat=self.stat, - pval=self.pval, dist=self.dist_name) + name = "" if not self._name else self._name + "\n" + msg = ( + "{name}H0: {null}\nStatistic: {stat:0.4f}\n" + "P-value: {pval:0.4f}\nDistributed: {dist}" + ) + return msg.format( + name=name, + null=self.null, + stat=self.stat, + pval=self.pval, + dist=self.dist_name, + ) def __repr__(self): - return self.__str__() + '\n' + \ - self.__class__.__name__ + \ - ', id: {0}'.format(hex(id(self))) + return ( + self.__str__() + + "\n" + + self.__class__.__name__ + + ", id: {0}".format(hex(id(self))) + ) class InvalidTestWarning(UserWarning): @@ -276,8 +285,10 @@ class InvalidTestStatistic(WaldTestStatistic): def __init__(self, reason, *, name=None): self._reason = reason - super(InvalidTestStatistic, self).__init__(np.NaN, np.NaN, df=1, df_denom=1, name=name) - self.dist_name = 'None' + super(InvalidTestStatistic, self).__init__( + np.NaN, np.NaN, df=1, df_denom=1, name=name + ) + self.dist_name = "None" @property def pval(self): @@ -291,7 +302,7 @@ def critical_values(self): def __str__(self): 
msg = "Invalid test statistic\n{reason}\n{name}" - name = '' if self._name is None else self._name + name = "" if self._name is None else self._name return msg.format(name=name, reason=self._reason) @@ -314,11 +325,12 @@ class InapplicableTestStatistic(WaldTestStatistic): def __init__(self, *, reason=None, name=None): self._reason = reason if reason is None: - self._reason = 'Test is not applicable to model specification' + self._reason = "Test is not applicable to model specification" - super(InapplicableTestStatistic, self).__init__(np.NaN, np.NaN, df=1, df_denom=1, - name=name) - self.dist_name = 'None' + super(InapplicableTestStatistic, self).__init__( + np.NaN, np.NaN, df=1, df_denom=1, name=name + ) + self.dist_name = "None" @property def pval(self): @@ -332,35 +344,35 @@ def critical_values(self): def __str__(self): msg = "Irrelevant test statistic\n{reason}\n{name}" - name = '' if self._name is None else self._name + name = "" if self._name is None else self._name return msg.format(name=name, reason=self._reason) def _str(v): """Preferred basic formatter""" if np.isnan(v): - return ' ' + return " " av = abs(v) digits = 0 if av != 0: digits = np.ceil(np.log10(av)) if digits > 4 or digits <= -4: - return '{0:8.4g}'.format(v) + return "{0:8.4g}".format(v) if digits > 0: d = int(5 - digits) else: d = int(4) - format_str = '{0:' + '0.{0}f'.format(d) + '}' + format_str = "{0:" + "0.{0}f".format(d) + "}" return format_str.format(v) def pval_format(v): """Preferred formatting for x in [0,1]""" if np.isnan(v): - return ' ' - return '{0:4.4f}'.format(v) + return " " + return "{0:4.4f}".format(v) class _SummaryStr(object): @@ -368,15 +380,18 @@ def __str__(self): return self.summary.as_text() def __repr__(self): - return self.__str__() + '\n' + \ - self.__class__.__name__ + \ - ', id: {0}'.format(hex(id(self))) + return ( + self.__str__() + + "\n" + + self.__class__.__name__ + + ", id: {0}".format(hex(id(self))) + ) def _repr_html_(self): - return self.summary.as_html() + '
<br/>id: {0}'.format(hex(id(self)))
+        return self.summary.as_html() + "<br/>
id: {0}".format(hex(id(self))) -def ensure_unique_column(col_name, df, addition='_'): +def ensure_unique_column(col_name, df, addition="_"): while col_name in df: col_name = addition + col_name + addition return col_name @@ -386,16 +401,19 @@ class _ModelComparison(_SummaryStr): """ Base class for model comparisons """ + _supported = tuple() - _PRECISION_TYPES = {'tstats': 'T-stats', - 'pvalues': 'P-values', - 'std_errors': 'Std. Errors'} + _PRECISION_TYPES = { + "tstats": "T-stats", + "pvalues": "P-values", + "std_errors": "Std. Errors", + } - def __init__(self, results, *, precision='tstats'): + def __init__(self, results, *, precision="tstats"): if not isinstance(results, (dict, OrderedDict)): _results = OrderedDict() for i, res in enumerate(results): - _results['Model ' + str(i)] = res + _results["Model " + str(i)] = res results = _results elif not isinstance(results, OrderedDict): _results = OrderedDict() @@ -406,15 +424,17 @@ def __init__(self, results, *, precision='tstats'): for key in self._results: if not isinstance(self._results[key], self._supported): - raise TypeError('Results from unknown model') - precision = precision.lower().replace('-', '_') - if precision not in ('tstats', 'pvalues', 'std_errors'): - raise ValueError('Unknown precision value. Must be one of \'tstats\', \'std_errors\' ' - 'or \'pvalues\'.') + raise TypeError("Results from unknown model") + precision = precision.lower().replace("-", "_") + if precision not in ("tstats", "pvalues", "std_errors"): + raise ValueError( + "Unknown precision value. Must be one of 'tstats', 'std_errors' " + "or 'pvalues'." + ) self._precision = precision def _get_series_property(self, name): - out = ([(k, getattr(v, name)) for k, v in self._results.items()]) + out = [(k, getattr(v, name)) for k, v in self._results.items()] cols = [v[0] for v in out] values = concat([v[1] for v in out], 1) values.columns = cols @@ -431,38 +451,40 @@ def _get_property(self, name): @property def nobs(self): """Parameters for all models""" - return self._get_property('nobs') + return self._get_property("nobs") @property def params(self): """Parameters for all models""" - return self._get_series_property('params') + return self._get_series_property("params") @property def tstats(self): """Parameter t-stats for all models""" - return self._get_series_property('tstats') + return self._get_series_property("tstats") @property def std_errors(self): """Parameter t-stats for all models""" - return self._get_series_property('std_errors') + return self._get_series_property("std_errors") @property def pvalues(self): """Parameter p-vals for all models""" - return self._get_series_property('pvalues') + return self._get_series_property("pvalues") @property def rsquared(self): """Coefficients of determination (R**2)""" - return self._get_property('rsquared') + return self._get_property("rsquared") @property def f_statistic(self): """F-statistics and P-values""" - out = self._get_property('f_statistic') - out_df = DataFrame(np.empty((len(out), 2)), columns=['F stat', 'P-value'], index=out.index) + out = self._get_property("f_statistic") + out_df = DataFrame( + np.empty((len(out), 2)), columns=["F stat", "P-value"], index=out.index + ) for loc in out.index: out_df.loc[loc] = out[loc].stat, out[loc].pval return out_df @@ -473,18 +495,22 @@ def missing_warning(missing): if not np.any(missing): return import linearmodels + if linearmodels.WARN_ON_MISSING: import warnings + warnings.warn(missing_value_warning_msg, MissingValueWarning) def param_table(results, title, 
@@ -473,18 +495,22 @@ def missing_warning(missing):
     if not np.any(missing):
         return
     import linearmodels
+
     if linearmodels.WARN_ON_MISSING:
         import warnings
+
         warnings.warn(missing_value_warning_msg, MissingValueWarning)


 def param_table(results, title, pad_bottom=False):
     """Formatted standard parameter table"""
-    param_data = np.c_[np.asarray(results.params)[:, None],
-                       np.asarray(results.std_errors)[:, None],
-                       np.asarray(results.tstats)[:, None],
-                       np.asarray(results.pvalues)[:, None],
-                       results.conf_int()]
+    param_data = np.c_[
+        np.asarray(results.params)[:, None],
+        np.asarray(results.std_errors)[:, None],
+        np.asarray(results.tstats)[:, None],
+        np.asarray(results.pvalues)[:, None],
+        results.conf_int(),
+    ]
     data = []
     for row in param_data:
         txt_row = []
@@ -494,19 +520,22 @@ def param_table(results, title, pad_bottom=False):
             f = pval_format
             txt_row.append(f(v))
         data.append(txt_row)
-    header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value', 'Lower CI', 'Upper CI']
+    header = ["Parameter", "Std. Err.", "T-stat", "P-value", "Lower CI", "Upper CI"]
     table_stubs = list(results.params.index)
     if pad_bottom:
         # Append blank row for spacing
-        data.append([''] * 6)
-        table_stubs += ['']
+        data.append([""] * 6)
+        table_stubs += [""]

-    return SimpleTable(data, stubs=table_stubs, txt_fmt=fmt_params,
-                       headers=header, title=title)
+    return SimpleTable(
+        data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title
+    )


 def format_wide(s, cols):
     """
+    Format a list of strings.
+
     Parameters
     ----------
     s : List[str]
@@ -516,25 +545,25 @@ def format_wide(s, cols):

     Returns
     -------
-    formatted : List[List[str]]
-        Joined list:
+    List[List[str]]
+        The joined list.
     """
     lines = []
-    line = ''
+    line = ""
     for i, val in enumerate(s):
-        if line == '':
+        if line == "":
             line = val
             if i + 1 != len(s):
-                line += ', '
+                line += ", "
         else:
             temp = line + val
             if i + 1 != len(s):
-                temp += ', '
+                temp += ", "
             if len(temp) > cols:
                 lines.append([line])
                 line = val
                 if i + 1 != len(s):
-                    line += ', '
+                    line += ", "
             else:
                 line = temp
     lines.append([line])
@@ -581,14 +610,13 @@ def panel_to_frame(x, items, major_axis, minor_axis, swap=False):
     df.sort_index(inplace=True)
     final_levels = [minor_axis, major_axis]
     df.index.set_levels(final_levels, [0, 1], inplace=True)
-    df.index.names = ['major', 'minor']
+    df.index.names = ["major", "minor"]
     return df


 def quadratic_form_test(params, cov, restriction=None, value=None, formula=None):
     if formula is not None and restriction is not None:
-        raise ValueError('restriction and formula cannot be used'
-                         'simultaneously.')
+        raise ValueError("restriction and formula cannot be used simultaneously.")
     if formula is not None:
         di = DesignInfo(list(params.index))
         lc = di.linear_constraint(formula)
@@ -601,7 +629,7 @@ def quadratic_form_test(params, cov, restriction=None, value=None, formula=None)
     rcov = restriction @ cov @ restriction.T
     stat = float(diff.T @ np.linalg.inv(rcov) @ diff)
     df = restriction.shape[0]
-    null = 'Linear equality constraint is valid'
-    name = 'Linear Equality Hypothesis Test'
+    null = "Linear equality constraint is valid"
+    name = "Linear Equality Hypothesis Test"

     return WaldTestStatistic(stat, null, df, name=name)
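quadratic_form_test forms a standard Wald statistic: for restrictions R, estimates b, and target q, it computes (Rb - q)' [R cov R']^{-1} (Rb - q), which is chi-square with one degree of freedom per restriction under the null. A self-contained numerical sketch with made-up values:

# Standalone illustration of the Wald statistic formed in quadratic_form_test.
# params, cov, and the restriction below are invented for the example.
import numpy as np
from scipy import stats

params = np.array([1.0, 2.0, 3.0])
cov = 0.25 * np.eye(3)
restriction = np.array([[1.0, -1.0, 0.0]])  # null hypothesis: b0 - b1 = 0
value = np.zeros(1)

diff = restriction @ params - value
rcov = restriction @ cov @ restriction.T
stat = float(diff.T @ np.linalg.inv(rcov) @ diff)
df = restriction.shape[0]
pval = stats.chi2.sf(stat, df)
print(stat, pval)  # 2.0, ~0.157: cannot reject b0 = b1 at conventional levels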
diff --git a/pyproject.toml b/pyproject.toml
index 9bb791f41e..b630094f91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,2 @@
 [build-system]
-requires = ["setuptools", "wheel"]
+requires = ["setuptools", "wheel", "Cython>=0.29.14"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index eb844d739c..535beb58f6 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,8 @@
 xarray>=0.9
-pytest>=5
+pytest>=5.0
+black==19.10b0
+flake8
+flake8-black
 sphinx
 sphinx-material
 ipython
@@ -9,4 +12,3 @@ nbconvert
 nbformat
 matplotlib
 seaborn
-Cython
diff --git a/requirements.txt b/requirements.txt
index d07bfc7a71..8024e0282e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ statsmodels>=0.9
 patsy
 property_cached>=1.6.3
 mypy_extensions>=0.4
+Cython>=0.29.14
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 749c88eb24..33bba4f065 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,6 @@
 [flake8]
 max-line-length = 99
+ignore = E203,W503,BLK100

 [versioneer]
 VCS = git
@@ -44,4 +45,4 @@ multi_line_output=0
 force_grid_wrap=0
 combine_as_imports=True
 force_sort_within_sections=True
-line_width=99
+line_width=88
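The new flake8 ignore entry keeps the linter from fighting the formatter: E203 (whitespace before ':') and W503 (line break before a binary operator) both fire on output Black considers correct, and BLK100 is flake8-black's own "would reformat" report. An illustrative snippet, not taken from the repository:

# E203: Black puts spaces around ':' in slices whose bounds are expressions,
# which un-configured flake8 flags as whitespace before ':'.
x = list(range(10))
offset = 2
chunk = x[offset + 1 : offset + 5]

# W503: when a long expression must wrap, Black prefers the break *before*
# the binary operator, which un-configured flake8 also flags.
total = (
    sum(chunk)
    + len(chunk)
)
print(chunk, total)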
diff --git a/setup.py b/setup.py
index 9929774e5e..f2809e70ad 100644
--- a/setup.py
+++ b/setup.py
@@ -22,70 +22,45 @@
 """
 try:
-    markdown = os.stat('README.md').st_mtime
-    if os.path.exists('README.rst'):
-        rst = os.stat('README.rst').st_mtime
+    markdown = os.stat("README.md").st_mtime
+    if os.path.exists("README.rst"):
+        rst = os.stat("README.rst").st_mtime
     else:
         rst = markdown - 1
     if rst >= markdown:
-        with open('README.rst', 'r') as rst:
+        with open("README.rst", "r") as rst:
             description = rst.read()
     else:
         import pypandoc

-        osx_line_ending = '\r'
-        windows_line_ending = '\r\n'
-        linux_line_ending = '\n'
+        osx_line_ending = "\r"
+        windows_line_ending = "\r\n"
+        linux_line_ending = "\n"

-        description = pypandoc.convert_file('README.md', 'rst')
+        description = pypandoc.convert_file("README.md", "rst")
         description = description.replace(windows_line_ending, linux_line_ending)
         description = description.replace(osx_line_ending, linux_line_ending)
-        with open('README.rst', 'w') as rst:
+        with open("README.rst", "w") as rst:
             rst.write(description)
 except (ImportError, OSError):
     import warnings

     warnings.warn("Unable to convert README.md to README.rst", UserWarning)
-    description = open('README.md').read()
-
-# Copy over notebooks from examples to docs for build
-notebooks = glob.glob('examples/*.ipynb')
-for nb in notebooks:
-    fname = os.path.split(nb)[-1]
-    folder, nbname = fname.split('_')
-    outdir = os.path.join('doc', 'source', folder, 'examples')
-    if not os.path.exists(outdir):
-        os.makedirs(outdir, exist_ok=True)
-    outfile = os.path.join(outdir, nbname)
-    with open(outfile, 'w') as nbout:
-        with open(nb, 'r') as nbin:
-            nbout.write(nbin.read())
-
-images = glob.glob('examples/*.png')
-for image in images:
-    fname = os.path.split(image)[-1]
-    folder, _ = fname.split('_')
-    outdir = os.path.join('doc', 'source', folder, 'examples')
-    if not os.path.exists(outdir):
-        os.makedirs(outdir, exist_ok=True)
-    outfile = os.path.join(outdir, fname)
-    with open(outfile, 'wb') as imageout:
-        with open(image, 'rb') as imagein:
-            imageout.write(imagein.read())
+    description = open("README.md").read()

 additional_files = []
-for filename in glob.iglob('./linearmodels/datasets/**', recursive=True):
-    if '.csv.bz' in filename:
-        additional_files.append(filename.replace('./linearmodels/', ''))
+for filename in glob.iglob("./linearmodels/datasets/**", recursive=True):
+    if ".csv.bz" in filename:
+        additional_files.append(filename.replace("./linearmodels/", ""))

-for filename in glob.iglob('./linearmodels/tests/**', recursive=True):
-    if '.txt' in filename or '.csv' in filename or '.dta' in filename:
-        additional_files.append(filename.replace('./linearmodels/', ''))
+for filename in glob.iglob("./linearmodels/tests/**", recursive=True):
+    if ".txt" in filename or ".csv" in filename or ".dta" in filename:
+        additional_files.append(filename.replace("./linearmodels/", ""))

-for filename in glob.iglob('./examples/**', recursive=True):
-    if '.png' in filename:
+for filename in glob.iglob("./examples/**", recursive=True):
+    if ".png" in filename:
         additional_files.append(filename)
@@ -94,59 +69,80 @@ def run_setup(binary=True):
     if binary:
         from Cython.Build import cythonize
         import numpy
-        macros = [('NPY_NO_DEPRECATED_API', '1')]
+
+        macros = [("NPY_NO_DEPRECATED_API", "1")]
         # macros.append(('CYTHON_TRACE', '1'))
         directives = {}  # {'linetrace': True, 'binding':True}
-        extension = Extension('linearmodels.panel._utility',
-                              ['linearmodels/panel/_utility.pyx'],
-                              define_macros=macros,
-                              include_dirs=[numpy.get_include()])
+        extension = Extension(
+            "linearmodels.panel._utility",
+            ["linearmodels/panel/_utility.pyx"],
+            define_macros=macros,
+            include_dirs=[numpy.get_include()],
+        )
         extensions.append(extension)
         extensions = cythonize(extensions, compiler_directives=directives, force=True)

-    setup(cmdclass=versioneer.get_cmdclass(),
-          name='linearmodels',
-          license='NCSA',
-          description='Instrumental Variable and Linear Panel models for Python',
-          version=versioneer.get_version(),
-          packages=find_packages(),
-          package_dir={'linearmodels': './linearmodels'},
-          author='Kevin Sheppard',
-          author_email='kevin.k.sheppard@gmail.com',
-          url='http://github.com/bashtage/linearmodels',
-          long_description=description,
-          install_requires=open('requirements.txt').read().split('\n'),
-          include_package_data=True,
-          package_data={'linearmodels': additional_files},
-          keywords=['linear models', 'regression', 'instrumental variables', 'IV',
-                    'panel', 'fixed effects', 'clustered', 'heteroskedasticity',
-                    'endogeneity', 'instruments', 'statistics',
-                    'statistical inference', 'econometrics'],
-          zip_safe=False,
-          classifiers=[
-              'Development Status :: 5 - Production/Stable',
-              'Intended Audience :: End Users/Desktop',
-              'Intended Audience :: Financial and Insurance Industry',
-              'Intended Audience :: Science/Research',
-              'Programming Language :: Python :: 3.6',
-              'Programming Language :: Python :: 3.7',
-              'Programming Language :: Python :: 3.8',
-              'License :: OSI Approved',
-              'Operating System :: MacOS :: MacOS X',
-              'Operating System :: Microsoft :: Windows',
-              'Operating System :: POSIX',
-              'Programming Language :: Python',
-              'Topic :: Scientific/Engineering',
-          ],
-          ext_modules=extensions,
-          python_requires='>=3.6',
-          )
+    setup(
+        cmdclass=versioneer.get_cmdclass(),
+        name="linearmodels",
+        license="NCSA",
+        description="Instrumental Variable and Linear Panel models for Python",
+        version=versioneer.get_version(),
+        packages=find_packages(),
+        package_dir={"linearmodels": "./linearmodels"},
+        author="Kevin Sheppard",
+        author_email="kevin.k.sheppard@gmail.com",
+        url="http://github.com/bashtage/linearmodels",
+        long_description=description,
+        install_requires=open("requirements.txt").read().split("\n"),
+        include_package_data=True,
+        package_data={"linearmodels": additional_files},
+        keywords=[
+            "linear models",
+            "regression",
+            "instrumental variables",
+            "IV",
+            "panel",
+            "fixed effects",
+            "clustered",
+            "heteroskedasticity",
+            "endogeneity",
+            "instruments",
+            "statistics",
+            "statistical inference",
+            "econometrics",
+        ],
+        zip_safe=False,
+        classifiers=[
+            "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: End Users/Desktop",
+            "Intended Audience :: Financial and Insurance Industry",
+            "Intended Audience :: Science/Research",
+            "Programming Language :: Python :: 3.6",
+            "Programming Language :: Python :: 3.7",
+            "Programming Language :: Python :: 3.8",
+            "License :: OSI Approved",
+            "Operating System :: MacOS :: MacOS X",
+            "Operating System :: Microsoft :: Windows",
+            "Operating System :: POSIX",
+            "Programming Language :: Python",
+            "Topic :: Scientific/Engineering",
+        ],
+        ext_modules=extensions,
+        python_requires=">=3.6",
+    )
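run_setup's binary flag exists for the fallback that follows: attempt a compiled build first, and rerun without the Cython extension when compilation is impossible. A condensed sketch of the same pattern; build_extensions and the paths are illustrative, not the project's actual API:

# Condensed sketch of the optional-extension fallback used below.
from distutils.errors import (
    CCompilerError,
    DistutilsExecError,
    DistutilsPlatformError,
)


def build_extensions(binary=True):
    if not binary:
        return []  # pure-Python install; no compiled modules
    from Cython.Build import cythonize

    return cythonize(["linearmodels/panel/_utility.pyx"])


try:
    ext_modules = build_extensions(binary=True)
except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ImportError):
    # No Cython or no working C compiler: fall back to pure Python.
    ext_modules = build_extensions(binary=False)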


 try:
     run_setup(binary=True)
-except (CCompilerError, DistutilsExecError, DistutilsPlatformError, IOError, ValueError,
-        ImportError):
+except (
+    CCompilerError,
+    DistutilsExecError,
+    DistutilsPlatformError,
+    IOError,
+    ValueError,
+    ImportError,
+):
     run_setup(binary=False)
     import warnings