diff --git a/.github/actions/install/action.yml b/.github/actions/install/action.yml new file mode 100644 index 0000000..ba69759 --- /dev/null +++ b/.github/actions/install/action.yml @@ -0,0 +1,21 @@ +name: "install" +description: "install requirements" + +inputs: + python-version: + required: false + description: "python version" + default: "3.10" +outputs: {} +runs: + using: "composite" + steps: + - uses: actions/setup-python@v4 + with: + python-version: ${{inputs.python-version}} + - name: install requirements + run: pip install -r requirements.txt + shell: bash + - name: install package + run: pip install . + shell: bash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0b19081 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,33 @@ +name: ci + +on: + pull_request: + push: + branches: + - "**" + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - name: Checkout code repository + uses: actions/checkout@v4 + - name: Install dependencies + uses: ./.github/actions/install + - name: Run pre-commit + run: pre-commit run --all-files + + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - name: Checkout code repository + uses: actions/checkout@v4 + - name: Install dependencies + uses: ./.github/actions/install + with: + python-version: ${{ matrix.python-version }} + - name: Run unit tests + run: pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3373afd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-ast + - id: check-added-large-files + - id: check-merge-conflict + - id: check-case-conflict + - id: check-docstring-first + - id: check-json + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: trailing-whitespace + - id: mixed-line-ending +- repo: local + hooks: + - id: black + name: black + entry: black . + language: system + types: [python] + - id: flake8 + name: flake8 + entry: flake8 eparse tests + language: system + types: [python] diff --git a/LICENSE b/LICENSE index 829dfec..8d64341 100644 --- a/LICENSE +++ b/LICENSE @@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- diff --git a/Makefile b/Makefile index e69dc12..e7038e7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: clean clean-build clean-pyc clean-test coverage dist docs help install lint lint/flake8 lint/black +.PHONY: clean clean-build clean-pyc clean-test coverage dist docs help install lint lint/black lint/flake8 pre-commit test test-all .DEFAULT_GOAL := help define BROWSER_PYSCRIPT @@ -47,12 +47,16 @@ clean-test: ## remove test and coverage artifacts rm -fr htmlcov/ rm -fr .pytest_cache +lint/black: ## check style with black + black --check --diff eparse tests + lint/flake8: ## check style with flake8 flake8 eparse tests -lint/black: ## check style with black - black --check -S eparse tests -lint: lint/flake8 lint/black ## check style +lint: lint/black lint/flake8 ## check style + +pre-commit: ## run pre-commit on all files + pre-commit run --all-files test: ## run tests quickly with the default Python pytest diff --git a/README.rst b/README.rst index d789e44..bafbd7d 100644 --- a/README.rst +++ b/README.rst @@ -361,6 +361,45 @@ If you would like to use eparse to partition xls[x] files alongside unstructured Valid `eparse_mode` settings are available in `eparse.contrib.unstructured.xlsx._eparse_modes`. +Development +=========== +Clone the repo: + +.. code-block:: + + $ git clone https://github.com/ChrisPappalardo/eparse.git + +Install devtest requirements and the package in editable mode: + +.. code-block:: + + $ pip install -r requirements.txt + $ pip install -e . + +Run unit tests: + +.. code-block:: + + $ make test + +Run the linter: + +.. code-block:: + + $ make lint + +Install pre-commit: + +.. code-block:: + + $ pre-commit install + +Run pre-commit: + +.. code-block:: + + $ pre-commit run --all-files + Contributing ============ As an open-source project, contributions are always welcome. Please see `Contributing `_ for more information.
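The development flow above can be sanity-checked from a Python shell. Below is a minimal sketch, assuming the editable install and the unit-test workbook shipped in `tests/`; it drives `get_df_from_file` and `df_serialize_table` from `eparse.core` the same way `eparse/cli.py` does, so treat it as an illustration rather than documented API:

.. code-block:: python

    from eparse.core import df_serialize_table, get_df_from_file

    # each yielded item is (table, name, excel_RC, sheet), matching the
    # CLI's unpacking; True enables loose table detection (the --loose flag)
    for table, name, excel_RC, sheet in get_df_from_file(
        "tests/eparse_unit_test_data.xlsx", True
    ):
        print(f"{name} {table.shape} found at {excel_RC} in {sheet}")
        # serialize into row dicts (row, column, value, type, headers, ...)
        rows = df_serialize_table(table, name=name, sheet=sheet)
        print(rows[0])
        break

Serialized rows are the shape the database interfaces expect; `BaseDatabaseInterface.output` raises `ValueError` when handed anything that is not an iterable of mappings.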
diff --git a/conftest.py b/conftest.py index 470a625..c187253 100644 --- a/conftest.py +++ b/conftest.py @@ -1,25 +1,21 @@ # -*- coding: utf-8 -*- -''' +""" unit test fixtures -''' - -import pytest +""" import pandas as pd +import pytest from peewee import SqliteDatabase -from eparse.interfaces import ( - DATABASE, - ExcelParse, -) +from eparse.interfaces import DATABASE, ExcelParse @pytest.fixture def ctx(): - ''' + """ click style ctx object fixture - ''' + """ class Obj: obj = {} @@ -29,31 +25,31 @@ class Obj: @pytest.fixture def data(): - ''' + """ serialized data fixture - ''' + """ return dict( row=0, column=0, - value='test', - type='test', - c_header='test', - r_header='test', - excel_RC='A1', - name='test', - sheet='test', - f_name='test', + value="test", + type="test", + c_header="test", + r_header="test", + excel_RC="A1", + name="test", + sheet="test", + f_name="test", ) @pytest.fixture def sqlite3_db(data): - ''' + """ sqlite3 in-memory database fixture - ''' + """ - db = ':memory:' + db = ":memory:" DATABASE.initialize(SqliteDatabase(db)) DATABASE.connect() DATABASE.create_tables([ExcelParse]) @@ -65,12 +61,12 @@ def sqlite3_db(data): @pytest.fixture def xlsx(): - ''' + """ excel file fixture - ''' + """ return pd.read_excel( - 'tests/eparse_unit_test_data.xlsx', + "tests/eparse_unit_test_data.xlsx", header=None, index_col=None, ) diff --git a/eparse/contrib/unstructured/partition.py b/contrib/unstructured/partition.py similarity index 100% rename from eparse/contrib/unstructured/partition.py rename to contrib/unstructured/partition.py diff --git a/eparse/contrib/unstructured/xlsx.py b/contrib/unstructured/xlsx.py similarity index 96% rename from eparse/contrib/unstructured/xlsx.py rename to contrib/unstructured/xlsx.py index f5c64ab..e1b1893 100644 --- a/eparse/contrib/unstructured/xlsx.py +++ b/contrib/unstructured/xlsx.py @@ -3,13 +3,7 @@ from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Union, cast -from eparse.core import ( - df_serialize_table, - get_df_from_file, - get_table_digest, -) import lxml.html - from unstructured.documents.elements import ( DataSourceMetadata, Element, @@ -25,12 +19,13 @@ spooled_to_bytes_io_if_needed, ) +from eparse.core import df_serialize_table, get_df_from_file, get_table_digest _eparse_modes = ( - 'eparse', - 'digest', - 'table-digest', - 'unstructured', + "eparse", + "digest", + "table-digest", + "unstructured", ) diff --git a/docs/conf.py b/docs/conf.py index 58e30ac..14ae283 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,10 +19,12 @@ # import os import sys -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) import eparse + + # -- General configuration --------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. @@ -31,22 +33,22 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document.
-master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'eparse' +project = "eparse" copyright = "2023, Chris Pappalardo" author = "Chris Pappalardo" @@ -69,10 +71,10 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -83,7 +85,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -94,13 +96,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'eparsedoc' +htmlhelp_basename = "eparsedoc" # -- Options for LaTeX output ------------------------------------------ @@ -109,15 +111,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -127,9 +126,7 @@ # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). latex_documents = [ - (master_doc, 'eparse.tex', - 'eparse Documentation', - 'Chris Pappalardo', 'manual'), + (master_doc, "eparse.tex", "eparse Documentation", "Chris Pappalardo", "manual"), ] @@ -137,11 +134,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'eparse', - 'eparse Documentation', - [author], 1) -] +man_pages = [(master_doc, "eparse", "eparse Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------- @@ -150,13 +143,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'eparse', - 'eparse Documentation', - author, - 'eparse', - 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "eparse", + "eparse Documentation", + author, + "eparse", + "One line description of project.", + "Miscellaneous", + ), ] - - - diff --git a/eparse/__init__.py b/eparse/__init__.py index f704fee..0f52bef 100644 --- a/eparse/__init__.py +++ b/eparse/__init__.py @@ -1,7 +1,7 @@ -''' +""" Top-level package for eparse. 
-''' +""" -__author__ = 'Chris Pappalardo' -__email__ = 'cpappala@gmail.com' -__version__ = '0.8.0' +__author__ = "Chris Pappalardo" +__email__ = "cpappala@gmail.com" +__version__ = "0.7.4" diff --git a/eparse/cli.py b/eparse/cli.py index dcdae89..74dd187 100644 --- a/eparse/cli.py +++ b/eparse/cli.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- -''' +""" excel parser cli module -''' +""" -import click +import sys from collections.abc import Iterable from pathlib import Path from pprint import PrettyPrinter -import sys +import click import pandas as pd from .core import ( @@ -22,12 +22,12 @@ def handle(e, exceptions=None, msg=None, debug=False, exit=True): - ''' + """ handle exceptions based on settings - ''' + """ if msg is None: - msg = f'an error occurred - {e}' + msg = f"an error occurred - {e}" if exceptions and not isinstance(exceptions, Iterable): exceptions = [exceptions] @@ -44,59 +44,59 @@ def handle(e, exceptions=None, msg=None, debug=False, exit=True): @click.group() @click.pass_context @click.option( - '--input', - '-i', + "--input", + "-i", type=str, - default='null:///', - help='input source', + default="null:///", + help="input source", ) @click.option( - '--output', - '-o', + "--output", + "-o", type=str, - default='null:///', - help='output destination', + default="null:///", + help="output destination", ) @click.option( - '--file', - '-f', + "--file", + "-f", type=str, multiple=True, - help='file(s) or dir(s) to target', + help="file(s) or dir(s) to target", ) @click.option( - '--debug', - '-d', + "--debug", + "-d", is_flag=True, default=False, - help='use debug mode', + help="use debug mode", ) @click.option( - '--loose', - '-l', + "--loose", + "-l", is_flag=True, default=True, - help='find tables loosely', + help="find tables loosely", ) @click.option( - '--recursive', - '-r', + "--recursive", + "-r", is_flag=True, default=False, - help='find files recursively', + help="find files recursively", ) @click.option( - '--truncate', - '-t', + "--truncate", + "-t", is_flag=True, default=True, - help='truncate dataframe output', + help="truncate dataframe output", ) @click.option( - '--verbose', - '-v', + "--verbose", + "-v", count=True, - help='increase output verbosity', + help="increase output verbosity", ) def main( ctx, @@ -109,85 +109,85 @@ def main( truncate, verbose, ): - ''' + """ excel parser - ''' + """ - ctx.obj['input'] = input - ctx.obj['output'] = output - ctx.obj['file'] = file - ctx.obj['debug'] = debug - ctx.obj['loose'] = loose - ctx.obj['recursive'] = recursive - ctx.obj['truncate'] = truncate - ctx.obj['verbose'] = verbose + ctx.obj["input"] = input + ctx.obj["output"] = output + ctx.obj["file"] = file + ctx.obj["debug"] = debug + ctx.obj["loose"] = loose + ctx.obj["recursive"] = recursive + ctx.obj["truncate"] = truncate + ctx.obj["verbose"] = verbose files = [] # get target file(s) for i in file: if Path(i).is_dir(): - g = '**/*' if recursive else '*' + g = "**/*" if recursive else "*" files += Path(i).glob(g) elif Path(i).is_file(): files.append(Path(i)) - ctx.obj['files'] = files + ctx.obj["files"] = files - if ctx.obj['verbose']: - print(f'found {len(files)} files') + if ctx.obj["verbose"]: + print(f"found {len(files)} files") # get input and output objects - for t in ('input', 'output'): + for t in ("input", "output"): try: - ctx.obj[f'{t}_obj'] = i_factory(ctx.obj[t], ExcelParse) + ctx.obj[f"{t}_obj"] = i_factory(ctx.obj[t], ExcelParse) except ValueError as e: - handle(e, msg=f'{t} error - {e}', debug=debug) + handle(e, msg=f"{t} error - {e}", 
debug=debug) # set truncate option if not truncate: # pd.set_option('display.max_colwidth', None) - pd.set_option('display.max_rows', None) + pd.set_option("display.max_rows", None) @main.command() @click.pass_context @click.option( - '--number', - '-n', + "--number", + "-n", type=int, default=None, - help='stop after n excel files', + help="stop after n excel files", ) @click.option( - '--sheet', - '-s', + "--sheet", + "-s", type=str, default=None, - help='name of sheet to scan for', + help="name of sheet to scan for", ) @click.option( - '--tables', - '-t', + "--tables", + "-t", is_flag=True, default=False, - help='count tables in scanned sheets', + help="count tables in scanned sheets", ) def scan(ctx, number, sheet, tables): - ''' + """ scan for excel files in target - ''' + """ - ctx.obj['number'] = number - ctx.obj['sheet'] = sheet - ctx.obj['tables'] = tables + ctx.obj["number"] = number + ctx.obj["sheet"] = sheet + ctx.obj["tables"] = tables - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(ctx.obj) # process each Excel file in files - for i, f in enumerate(ctx.obj['files']): - if f.is_file() and 'xls' in f.name: + for i, f in enumerate(ctx.obj["files"]): + if f.is_file() and "xls" in f.name: try: e_file = pd.read_excel( f, @@ -196,8 +196,8 @@ def scan(ctx, number, sheet, tables): index_col=None, ) except Exception as e: - msg = f'skipping {f} - {e}' - handle(e, msg=msg, debug=ctx.obj['debug'], exit=False) + msg = f"skipping {f} - {e}" + handle(e, msg=msg, debug=ctx.obj["debug"], exit=False) continue # get basic info about Excel file @@ -208,32 +208,32 @@ def scan(ctx, number, sheet, tables): sheets = e_file.keys() # build output result based on options selected - result = f'{f.name}' + result = f"{f.name}" - if ctx.obj['verbose']: - result += f' {f_size_mb:.2f}MB' + if ctx.obj["verbose"]: + result += f" {f_size_mb:.2f}MB" if sheet is not None: - result += f' with {sheet} {e_file.shape}' + result += f" with {sheet} {e_file.shape}" if tables: - t = df_find_tables(e_file, ctx.obj['loose']) - result += f' containing {len(t)} tables' + t = df_find_tables(e_file, ctx.obj["loose"]) + result += f" containing {len(t)} tables" - if ctx.obj['verbose'] > 1: - result += f' ({t})' + if ctx.obj["verbose"] > 1: + result += f" ({t})" else: - if ctx.obj['verbose']: - result += f' with {len(sheets)} sheets' + if ctx.obj["verbose"]: + result += f" with {len(sheets)} sheets" - if ctx.obj['verbose'] > 1 and len(sheets): + if ctx.obj["verbose"] > 1 and len(sheets): result += f' {",".join(sheets)}' # print result print(result) - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(e_file) # continue if number has not been reached @@ -244,49 +244,49 @@ def scan(ctx, number, sheet, tables): @main.command() @click.pass_context @click.option( - '--sheet', - '-s', + "--sheet", + "-s", type=str, multiple=True, - help='name of sheet(s) to parse', + help="name of sheet(s) to parse", ) @click.option( - '--serialize', - '-z', + "--serialize", + "-z", is_flag=True, default=False, - help='serialize table output', + help="serialize table output", ) @click.option( - '--table', - '-t', + "--table", + "-t", type=str, default=None, - help='name of table to parse', + help="name of table to parse", ) @click.option( - '--nacount', + "--nacount", type=int, default=0, - help='allow for this many NA values when spanning rows and columns', + help="allow for this many NA values when spanning rows and columns", ) def parse(ctx, sheet, serialize, table, nacount): - ''' + """ parse table(s) found in 
sheet for target(s) - ''' + """ - ctx.obj['sheet'] = sheet - ctx.obj['serialize'] = serialize - ctx.obj['table'] = table - ctx.obj['na_tolerance_r'] = nacount + 1 - ctx.obj['na_tolerance_c'] = nacount + 1 + ctx.obj["sheet"] = sheet + ctx.obj["serialize"] = serialize + ctx.obj["table"] = table + ctx.obj["na_tolerance_r"] = nacount + 1 + ctx.obj["na_tolerance_c"] = nacount + 1 - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(ctx.obj) - for f in ctx.obj['files']: - if f.is_file() and 'xls' in f.name: - print(f'{f.name}') + for f in ctx.obj["files"]: + if f.is_file() and "xls" in f.name: + print(f"{f.name}") try: for ( @@ -296,14 +296,14 @@ def parse(ctx, sheet, serialize, table, nacount): s, ) in get_df_from_file( f, - ctx.obj['loose'], + ctx.obj["loose"], sheet, table, - ctx.obj['na_tolerance_r'], - ctx.obj['na_tolerance_c'], + ctx.obj["na_tolerance_r"], + ctx.obj["na_tolerance_c"], ): - if ctx.obj['verbose']: - m = '{} table {} {} found at {} in {}' + if ctx.obj["verbose"]: + m = "{} table {} {} found at {} in {}" v = (f.name, name, output.shape, excel_RC, s) print(m.format(*v)) @@ -315,115 +315,115 @@ def parse(ctx, sheet, serialize, table, nacount): f_name=f.name, ) - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(output) try: - ctx.obj['output_obj'].output(output, ctx) + ctx.obj["output_obj"].output(output, ctx) except Exception as e: msg = f'output to {ctx.obj["output"]} failed - {e}' - handle(e, msg=msg, debug=ctx.obj['debug'], exit=False) + handle(e, msg=msg, debug=ctx.obj["debug"], exit=False) break except Exception as e: - msg = f'skipping {f} - {e}' - handle(e, msg=msg, debug=ctx.obj['debug'], exit=False) + msg = f"skipping {f} - {e}" + handle(e, msg=msg, debug=ctx.obj["debug"], exit=False) continue @main.command() @click.pass_context @click.option( - '--filter', - '-f', + "--filter", + "-f", type=str, nargs=2, multiple=True, - help='django-style filter(s) to apply to base queryset', + help="django-style filter(s) to apply to base queryset", ) @click.option( - '--method', - '-m', + "--method", + "-m", type=str, - default='get_queryset', - help='method to call on eparse model', + default="get_queryset", + help="method to call on eparse model", ) @click.option( - '--serialize', - '-z', + "--serialize", + "-z", is_flag=True, default=False, - help='serialize query output', + help="serialize query output", ) def query(ctx, filter, method, serialize): - ''' + """ query eparse output - ''' + """ - ctx.obj['filters'] = {k: v for k, v in filter} - ctx.obj['method'] = method + ctx.obj["filters"] = {k: v for k, v in filter} + ctx.obj["method"] = method - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(ctx.obj) # input data try: - data = ctx.obj['input_obj'].input(method, **ctx.obj['filters']) + data = ctx.obj["input_obj"].input(method, **ctx.obj["filters"]) except Exception as e: msg = f'input from {ctx.obj["input"]} failed with {e}' - handle(e, msg=msg, debug=ctx.obj['debug']) + handle(e, msg=msg, debug=ctx.obj["debug"]) if serialize: try: - data = [df_normalize_data(d) for d in data.to_dict('records')] + data = [df_normalize_data(d) for d in data.to_dict("records")] except Exception as e: - msg = 'serialization error (some methods can\'t be serialized)' - handle(e, msg=f'{msg} - {e}', debug=ctx.obj['debug']) + msg = "serialization error (some methods can't be serialized)" + handle(e, msg=f"{msg} - {e}", debug=ctx.obj["debug"]) # output data try: - ctx.obj['output_obj'].output(data, ctx) + ctx.obj["output_obj"].output(data, ctx) 
except Exception as e: msg = f'output to {ctx.obj["output"]} failed with {e}' - handle(e, msg=msg, debug=ctx.obj['debug']) + handle(e, msg=msg, debug=ctx.obj["debug"]) @main.command() @click.pass_context @click.option( - '--migration', - '-m', + "--migration", + "-m", required=True, type=str, multiple=True, - help='database migration(s) to apply', + help="database migration(s) to apply", ) def migrate(ctx, migration): - ''' + """ migrate eparse table - ''' + """ - ctx.obj['migration'] = migration + ctx.obj["migration"] = migration - if ctx.obj['debug']: + if ctx.obj["debug"]: PrettyPrinter().pprint(ctx.obj) # apply migrations - for _migration in ctx.obj['migration']: + for _migration in ctx.obj["migration"]: try: - ctx.obj['input_obj'].migrate(_migration) - print(f'applied {_migration}') + ctx.obj["input_obj"].migrate(_migration) + print(f"applied {_migration}") except Exception as e: - handle(e, msg=f'migration error - {e}', debug=ctx.obj['debug']) + handle(e, msg=f"migration error - {e}", debug=ctx.obj["debug"]) def entry_point(): - ''' + """ required to make setuptools and click play nicely (context object) - ''' + """ return sys.exit(main(obj={})) -if __name__ == '__main__': +if __name__ == "__main__": entry_point() diff --git a/eparse/contrib/__init__.py b/eparse/contrib/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/eparse/contrib/unstructured/__init__.py b/eparse/contrib/unstructured/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/eparse/core.py b/eparse/core.py index 9476038..7239288 100644 --- a/eparse/core.py +++ b/eparse/core.py @@ -1,14 +1,14 @@ # -*- coding: utf-8 -*- -''' +""" excel parser core module -''' +""" +from io import StringIO from typing import Any, Dict, Iterable, List, Optional, Tuple -from openpyxl.utils.cell import get_column_letter import pandas as pd - +from openpyxl.utils.cell import get_column_letter TableRef = Tuple[int, int, str, str] # r, c, excel RC, value @@ -23,9 +23,9 @@ def df_find_tables( df: pd.DataFrame, loose: bool = False, ) -> List[TableRef]: - ''' + """ finds table corners in a dataframe - ''' + """ result = [] @@ -69,7 +69,7 @@ def df_find_tables( ( r, c, - f'{get_column_letter(c+1)}{r+1}', + f"{get_column_letter(c+1)}{r+1}", str(df.at[r, c]), ) ) @@ -78,9 +78,9 @@ def df_find_tables( def _is_rowspan(df: pd.DataFrame, r: int, c: int) -> bool: - ''' + """ detect a rowspan label - ''' + """ try: isna_right = pd.isna(df.at[r, (c + 1)]) @@ -93,9 +93,9 @@ def _is_rowspan(df: pd.DataFrame, r: int, c: int) -> bool: def _has_empty_corner(df: pd.DataFrame, r: int, c: int) -> bool: - ''' + """ detect an empty corner - ''' + """ try: isna_above = pd.isna(df.at[(r - 1), c]) @@ -115,9 +115,9 @@ def df_parse_table( na_tolerance_c: int = 1, na_strip: bool = True, ) -> pd.DataFrame: - ''' + """ extract a table from a dataframe for a given r, c position - ''' + """ # make reference adjustments if _is_rowspan(df, r, c): @@ -161,21 +161,21 @@ def df_parse_table( def df_normalize_data(data: Dict) -> Dict: - ''' + """ normalize table data - ''' + """ result = {} - ints = ('row', 'column') + ints = ("row", "column") strs = ( - 'value', - 'type', - 'c_header', - 'r_header', - 'excel_RC', - 'name', - 'sheet', - 'f_name', + "value", + "type", + "c_header", + "r_header", + "excel_RC", + "name", + "sheet", + "f_name", ) for k in ints: @@ -186,9 +186,9 @@ def df_normalize_data(data: Dict) -> Dict: if k in data: result[k] = str(data[k]) - if 'timestamp' in data: - if isinstance(data['timestamp'], pd.Timestamp): - 
result['timestamp'] = data['timestamp'].to_pydatetime() + if "timestamp" in data: + if isinstance(data["timestamp"], pd.Timestamp): + result["timestamp"] = data["timestamp"].to_pydatetime() return result @@ -197,9 +197,9 @@ def df_serialize_table( df: pd.DataFrame, **other_data, ) -> List[Dict]: - ''' + """ serialize table into a list of dicts with meta data - ''' + """ column_header = df.iloc[0] row_header = df.iloc[:, 0] @@ -213,13 +213,13 @@ def df_serialize_table( result.append( df_normalize_data( { - 'row': r, - 'column': c, - 'value': df.iloc[r, c], - 'type': type(df.iloc[r, c]), - 'c_header': column_header.iloc[c], - 'r_header': row_header.iloc[r], - 'excel_RC': f'{get_column_letter(_c+1)}{_r+1}', + "row": r, + "column": c, + "value": df.iloc[r, c], + "type": type(df.iloc[r, c]), + "c_header": column_header.iloc[c], + "r_header": row_header.iloc[r], + "excel_RC": f"{get_column_letter(_c+1)}{_r+1}", **other_data, } ) @@ -237,9 +237,9 @@ def get_df_from_file( na_tolerance_c: int = 1, na_strip: bool = True, ): - ''' + """ helper function to yield tables from a file - ''' + """ f = pd.read_excel( io, @@ -280,26 +280,26 @@ def get_table_digest( filename: Optional[str] = None, sheet: Optional[str] = None, ) -> str: - ''' + """ generate a digest that describes a serialized table - ''' + """ df = pd.DataFrame.from_records(serialized_table) - rows = len(df['row'].unique()) - cols = len(df['column'].unique()) - c_headers = df['c_header'].unique() - r_headers = df['r_header'].unique() - types = df['type'].unique() - - sheet_str = f' in sheet {sheet}' if sheet else '' - file_str = f' of Excel file {filename}' if filename else '' + rows = len(df["row"].unique()) + cols = len(df["column"].unique()) + c_headers = df["c_header"].unique() + r_headers = df["r_header"].unique() + types = df["type"].unique() + + sheet_str = f" in sheet {sheet}" if sheet else "" + file_str = f" of Excel file {filename}" if filename else "" type_str = f' {", ".join([str(t) for t in types])} type(s)' digest = ( - f'{table_name} is a table{sheet_str}{file_str} ' + f"{table_name} is a table{sheet_str}{file_str} " f'with {cols} column(s) having names like {", ".join(c_headers)} ' f'and {rows} row(s) having names like {", ".join(r_headers)} ' - f'and contains {rows*cols} cells of{type_str}' + f"and contains {rows*cols} cells of{type_str}" ) return digest @@ -308,12 +308,12 @@ def get_table_digest( def html_to_df( html: str, ) -> pd.DataFrame: - ''' + """ helper function to return pandas dataframe from html - ''' + """ return pd.read_html( - html, + StringIO(html), header=None, index_col=None, ) @@ -323,8 +323,8 @@ def html_to_serialized_data( html: str, **other_data, ) -> List[Dict]: - ''' + """ helper function to return serialized data from html - ''' + """ return df_serialize_table(html_to_df(html)[0], **other_data) diff --git a/eparse/interfaces.py b/eparse/interfaces.py index 80445ea..0aa7164 100644 --- a/eparse/interfaces.py +++ b/eparse/interfaces.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- -''' +""" excel parser interfaces -''' +""" +import importlib +import re from abc import abstractmethod from collections.abc import Iterable, Mapping from datetime import datetime -import importlib from pprint import PrettyPrinter -import re from typing import Dict, Optional from uuid import uuid4 @@ -19,23 +19,22 @@ CharField, DatabaseProxy, DateTimeField, - fn, IntegerField, Model, PostgresqlDatabase, SqliteDatabase, + fn, ) from .core import html_to_serialized_data - DATABASE = DatabaseProxy() class ExcelParse(Model): - ''' + """ 
excel parse model - ''' + """ id = AutoField() row = IntegerField() @@ -52,26 +51,26 @@ @classmethod def get_queryset(cls, *args, **kwargs): - ''' + """ return queryset with filters applied - ''' + """ query = cls.filter(**kwargs) return pd.DataFrame(query.dicts()) @classmethod def get_column(cls, column, *args, **kwargs): - ''' + """ return distinct values from column with aggregations - ''' + """ query = ( cls.filter(**kwargs) .select( getattr(cls, column), - fn.COUNT(cls.id).alias('Total Rows'), - fn.COUNT(cls.type.distinct()).alias('Data Types'), - fn.COUNT(cls.value.distinct()).alias('Distinct Values'), + fn.COUNT(cls.id).alias("Total Rows"), + fn.COUNT(cls.type.distinct()).alias("Data Types"), + fn.COUNT(cls.value.distinct()).alias("Distinct Values"), ) .group_by(getattr(cls, column)) ) @@ -79,20 +78,20 @@ class Meta: database = DATABASE - indexes = ((('f_name', 'sheet', 'name'), False),) + indexes = ((("f_name", "sheet", "name"), False),) class BaseInterface: - ''' + """ base interface class - ''' + """ - endpoint = '' - user = '' - password = '' - host = '' + endpoint = "" + user = "" + password = "" + host = "" port = 0 - name = '' + name = "" Database = None Model = None @@ -104,42 +103,42 @@ def __init__(self, uri: str, Model: Optional[Model] = None): @abstractmethod def input(self): - ''' + """ from_X override with input handler - ''' + """ pass @abstractmethod def output(self, data: pd.DataFrame, obj: Dict) -> pd.DataFrame: - ''' + """ to_X override with output handler - ''' + """ pass @abstractmethod def migrate(self, migration: str): - ''' + """ override with migration handler - ''' + """ pass @classmethod def parse_uri(self, uri: str) -> Dict: - ''' + """ parse eparse URI string - ''' + """ - patt = r'^(?P<endpoint>.*)://(?P<user>.*?)(:(?P<password>.*?))?(@(?P<host>.*?)(:(?P<port>.*?))?)?/(?P<name>.*)?$' # noqa + patt = r"^(?P<endpoint>.*)://(?P<user>.*?)(:(?P<password>.*?))?(@(?P<host>.*?)(:(?P<port>.*?))?)?/(?P<name>.*)?$" # noqa return re.match(patt, uri).groupdict() class NullInterface(BaseInterface): - ''' + """ null interface - ''' + """ def input(self): return pd.DataFrame() @@ -152,9 +151,9 @@ def migrate(self, *args, **kwargs): class StdoutInterface(BaseInterface): - ''' + """ stdout interface - ''' + """ def input(self): return pd.DataFrame() @@ -167,15 +166,15 @@ def migrate(self, *args, **kwargs): class BaseDatabaseInterface(BaseInterface): - ''' + """ base database interface - ''' + """ @abstractmethod def initialize(self, *args, **kwargs): - ''' + """ override with db-specific initialization - ''' + """ pass @@ -185,8 +184,8 @@ def input(self, method, **kwargs): # if no explicit method is available, try get_column if m is None: m = self.Model.get_column - patt = r'^(?:get_)?(?P<column>.*)$' - kwargs['column'] = re.match(patt, method).group('column') + patt = r"^(?:get_)?(?P<column>.*)$" + kwargs["column"] = re.match(patt, method).group("column") self.initialize(DATABASE) DATABASE.connect() @@ -195,9 +194,9 @@ def output(self, data, *args, **kwargs): # skip empty data - if hasattr(data, 'empty') and data.empty: + if hasattr(data, "empty") and data.empty: return - elif not hasattr(data, 'empty') and not data: + elif not hasattr(data, "empty") and not data: return # check that data is serialized try: assert isinstance(data, Iterable) assert isinstance(data[0], Mapping) except Exception: - raise ValueError('bad data - did you serialize it first?') + raise ValueError("bad data - did you serialize it first?")
self.initialize(DATABASE) DATABASE.connect() @@ -219,10 +218,10 @@ def output(self, data, *args, **kwargs): def migrate(self, migration): try: - m = importlib.import_module('eparse.migrations') + m = importlib.import_module("eparse.migrations") migration_fcn = getattr(m, migration) except AttributeError: - msg = f'migration error - there is no {migration}' + msg = f"migration error - there is no {migration}" raise AttributeError(msg) self.initialize(DATABASE) @@ -231,24 +230,24 @@ def migrate(self, migration): class Sqlite3Interface(BaseDatabaseInterface): - ''' + """ sqlite3 interface - ''' + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if not self.name: - self.name = f'.files/{uuid4()}.db' + self.name = f".files/{uuid4()}.db" def initialize(self, db): db.initialize(SqliteDatabase(self.name)) class PostgresInterface(BaseDatabaseInterface): - ''' + """ postgres interface - ''' + """ def initialize(self, db): db.initialize( @@ -263,9 +262,9 @@ def initialize(self, db): class HtmlInterface(Sqlite3Interface): - ''' + """ html data interface using sqlite3 - ''' + """ def __init__(self, *args, html: Optional[str] = None, **kwargs): super().__init__(*args, **kwargs) @@ -275,20 +274,20 @@ def __init__(self, *args, html: Optional[str] = None, **kwargs): def i_factory(uri, Model=None, **kwargs): - ''' + """ return interface object based on uri - ''' + """ - if uri.startswith('null'): + if uri.startswith("null"): return NullInterface(uri, **kwargs) - elif uri.startswith('stdout'): + elif uri.startswith("stdout"): return StdoutInterface(uri, **kwargs) - elif uri.startswith('sqlite3'): + elif uri.startswith("sqlite3"): return Sqlite3Interface(uri, Model, **kwargs) - elif uri.startswith('postgres'): + elif uri.startswith("postgres"): return PostgresInterface(uri, Model, **kwargs) - elif uri.startswith('html'): - _uri = uri.replace('html', 'sqlite3') + elif uri.startswith("html"): + _uri = uri.replace("html", "sqlite3") return HtmlInterface(_uri, Model, **kwargs) - raise ValueError(f'{uri} is not a recognized endpoint') + raise ValueError(f"{uri} is not a recognized endpoint") diff --git a/eparse/migrations.py b/eparse/migrations.py index e20c864..29c3cc8 100644 --- a/eparse/migrations.py +++ b/eparse/migrations.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- -''' +""" excel parser database migrations -''' +""" -from playhouse.migrate import migrate, SchemaMigrator +from playhouse.migrate import SchemaMigrator, migrate def migration_000102_000200(model): - ''' + """ database migration from 0.1.2 to 0.2.0 - ''' + """ database = model._meta.database.obj timestamp_field = model.timestamp @@ -21,44 +21,44 @@ def migration_000102_000200(model): migrate( # table column_name new_field migrator.add_column( - 'excelparse', - 'timestamp', + "excelparse", + "timestamp", timestamp_field, ), # table column_name(s) unique migrator.add_index( - 'excelparse', - ('c_header',), + "excelparse", + ("c_header",), False, ), migrator.add_index( - 'excelparse', - ('r_header',), + "excelparse", + ("r_header",), False, ), migrator.add_index( - 'excelparse', - ('excel_RC',), + "excelparse", + ("excel_RC",), False, ), migrator.add_index( - 'excelparse', - ('name',), + "excelparse", + ("name",), False, ), migrator.add_index( - 'excelparse', - ('sheet',), + "excelparse", + ("sheet",), False, ), migrator.add_index( - 'excelparse', - ('f_name',), + "excelparse", + ("f_name",), False, ), migrator.add_index( - 'excelparse', - ('f_name', 'sheet', 'name'), + "excelparse", + ("f_name", "sheet", "name"), False, ), 
) diff --git a/requirements.txt b/requirements.txt index 61e05be..c587fc5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ # base -click==8.1.3 -openpyxl==3.1.2 -lxml==4.9.3 -pandas==2.0.1 -peewee==3.16.2 -unstructured==0.10.5 +click==8.1.7 +openpyxl==3.1.5 +lxml==5.2.2 +pandas==2.2.2 +peewee==3.17.6 # devtest -black==23.3.0 -coverage==7.2.7 -flake8==6.0.0 -ipython==8.12.2 -pytest==7.3.1 -tox==4.6.0 +black>=23.3.0 +coverage>=7.2.7 +flake8>=6.0.0 +ipython>=8.12.2 +pre-commit>=3.7.1 +pytest>=7.3.1 +tox>=4.6.0 diff --git a/setup.cfg b/setup.cfg index d5fa518..ed46b6f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,9 +3,14 @@ universal = 1 [flake8] exclude = docs tests +max-line-length = 100 per-file-ignores = eparse/contrib/unstructured/partition.py:E501 eparse/contrib/unstructured/xlsx.py:E501 +[isort] +profile=hug +src_paths=eparse,tests + [tool:pytest] addopts = --ignore=setup.py diff --git a/setup.py b/setup.py index 601fd1a..15c0b07 100644 --- a/setup.py +++ b/setup.py @@ -1,70 +1,68 @@ #!/usr/bin/env python -''' +""" setup script for eparse -''' +""" -from setuptools import setup, find_packages +from setuptools import find_packages, setup - -with open('README.rst') as readme_file: +with open("README.rst") as readme_file: readme = readme_file.read() -with open('HISTORY.rst') as history_file: +with open("HISTORY.rst") as history_file: history = history_file.read() requirements = [ - 'click>=8.0.0', - 'openpyxl>=3.0.0', - 'lxml>=4.9.3', - 'pandas>=2.0.0', - 'peewee>=3.16.0', - 'unstructured>=0.8.5', + "click>=8.0.0", + "openpyxl>=3.0.0", + "lxml>=4.9.3", + "pandas>=2.2.0", + "peewee>=3.16.0", ] test_requirements = [ - 'black', - 'coverage', - 'flake8', - 'ipython', - 'pytest', - 'tox', + "black", + "coverage", + "flake8", + "ipython", + "pytest", + "tox", ] setup( - author='Chris Pappalardo', - author_email='cpappala@gmail.com', - python_requires='>=3.8', + author="Chris Pappalardo", + author_email="cpappala@gmail.com", + python_requires=">=3.8", classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: End Users/Desktop', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], - description=''' + description=""" Excel spreadsheet crawler and table parser for data discovery, extraction, and querying - ''', + """, entry_points={ - 'console_scripts': [ - 'eparse=eparse.cli:entry_point', + "console_scripts": [ + "eparse=eparse.cli:entry_point", ], }, install_requires=requirements, - license='MIT license', - long_description=readme + '\n\n' + history, + license="MIT license", + long_description=readme + "\n\n" + history, include_package_data=True, - keywords='eparse', - name='eparse', - packages=find_packages(include=['eparse', 'eparse.*']), - test_suite='tests', + keywords="eparse", + name="eparse", + packages=find_packages(include=["eparse", "eparse.*"]), + test_suite="tests", 
tests_require=test_requirements, - url='https://github.com/ChrisPappalardo/eparse', - version='0.7.3', + url="https://github.com/ChrisPappalardo/eparse", + version="0.7.4", zip_safe=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0904da1..65f57af 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,70 +1,69 @@ # -*- coding: utf-8 -*- -''' +""" unit tests for eparse cli -''' +""" -from click.testing import CliRunner import pytest +from click.testing import CliRunner from eparse.cli import main - -kwargs = {'obj': {}, 'catch_exceptions': False} +kwargs = {"obj": {}, "catch_exceptions": False} def test_main(): runner = CliRunner() - result = runner.invoke(main, ['--help']) + result = runner.invoke(main, ["--help"]) assert result.exit_code == 0 - assert 'Usage' in result.output + assert "Usage" in result.output def test_scan(): runner = CliRunner() - result = runner.invoke(main, ['-f', 'tests/', 'scan'], **kwargs) + result = runner.invoke(main, ["-f", "tests/", "scan"], **kwargs) assert result.exit_code == 0 - assert 'eparse_unit_test_data' in result.output + assert "eparse_unit_test_data" in result.output def test_parse(): runner = CliRunner() - result = runner.invoke(main, ['-f', 'tests/', 'parse'], **kwargs) + result = runner.invoke(main, ["-f", "tests/", "parse"], **kwargs) assert result.exit_code == 0 - assert 'eparse_unit_test_data' in result.output + assert "eparse_unit_test_data" in result.output def test_query(): runner = CliRunner() - result = runner.invoke(main, ['-i', 'sqlite3:///tests/test.db', 'query'], **kwargs) + result = runner.invoke(main, ["-i", "sqlite3:///tests/test.db", "query"], **kwargs) assert result.exit_code == 0 - assert result.output == '' + assert result.output == "" def test_migrate(): runner = CliRunner() result = runner.invoke( main, - ['-i', 'sqlite3:///tests/test.db', 'migrate', '-m', 'migration_000102_000200'], + ["-i", "sqlite3:///tests/test.db", "migrate", "-m", "migration_000102_000200"], **kwargs ) assert result.exit_code == 1 - assert 'duplicate column name: timestamp' in result.output + assert "duplicate column name: timestamp" in result.output def test_outputs(): runner = CliRunner() - result = runner.invoke(main, ['-o', 'null:///', 'scan'], **kwargs) + result = runner.invoke(main, ["-o", "null:///", "scan"], **kwargs) assert result.exit_code == 0 - assert result.output == '' - result = runner.invoke(main, ['-o', 'stdout:///', 'scan'], **kwargs) + assert result.output == "" + result = runner.invoke(main, ["-o", "stdout:///", "scan"], **kwargs) assert result.exit_code == 0 - assert result.output == '' - result = runner.invoke(main, ['-o', 'sqlite3:///:memory:', 'scan'], **kwargs) + assert result.output == "" + result = runner.invoke(main, ["-o", "sqlite3:///:memory:", "scan"], **kwargs) assert result.exit_code == 0 - assert result.output == '' - result = runner.invoke(main, ['-o', 'test', 'scan'], **kwargs) + assert result.output == "" + result = runner.invoke(main, ["-o", "test", "scan"], **kwargs) assert result.exit_code == 1 - assert 'test is not a recognized endpoint' in result.output + assert "test is not a recognized endpoint" in result.output with pytest.raises(ValueError): - result = runner.invoke(main, ['-d', '-o', 'test', 'scan'], **kwargs) + result = runner.invoke(main, ["-d", "-o", "test", "scan"], **kwargs) diff --git a/tests/test_core.py b/tests/test_core.py index cfd54e6..fcb394a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -''' +""" unit tests for eparse core 
-''' +""" import pandas as pd @@ -20,20 +20,20 @@ def test_df_find_tables(xlsx): t = df_find_tables(xlsx) assert len(t) == 2 - assert (2, 2, 'C3', 'ID') in t + assert (2, 2, "C3", "ID") in t def test_df_find_tables_loose(xlsx): t = df_find_tables(xlsx, loose=True) assert len(t) == 10 - assert (2, 2, 'C3', 'ID') in t - assert (102, 2, 'C103', 'Schedule of Principal Repayments:') in t + assert (2, 2, "C3", "ID") in t + assert (102, 2, "C103", "Schedule of Principal Repayments:") in t def test_df_parse_table(xlsx): t = df_parse_table(xlsx, 102, 2) assert t.shape == (11, 8) - assert t.iloc[0, 2] == 'Date' + assert t.iloc[0, 2] == "Date" def test_df_parse_table_na_tolerance(xlsx): @@ -51,17 +51,17 @@ def test_df_parse_table_na_tolerance(xlsx): def test_df_serialize_table(xlsx): - t = df_serialize_table(df_parse_table(xlsx, 102, 2), foo='bar') + t = df_serialize_table(df_parse_table(xlsx, 102, 2), foo="bar") assert len(t) == 11 * 8 assert isinstance(t[22], dict) - assert 'c_header' in t[22].keys() - assert t[22]['c_header'] == 'Date' + assert "c_header" in t[22].keys() + assert t[22]["c_header"] == "Date" def test_get_df_from_file(): - filename = 'tests/eparse_unit_test_data.xlsx' + filename = "tests/eparse_unit_test_data.xlsx" df_a, *_ = next(get_df_from_file(filename)) - with open(filename, 'rb') as file: + with open(filename, "rb") as file: df_b, *_ = next(get_df_from_file(file)) assert isinstance(df_a, pd.DataFrame) assert isinstance(df_b, pd.DataFrame) @@ -71,13 +71,13 @@ def test_get_df_from_file(): def test_get_table_digest(xlsx): parse = df_parse_table(xlsx, 26, 1) serialized_table = df_serialize_table(parse) - digest = get_table_digest(serialized_table, table_name='Financials') + digest = get_table_digest(serialized_table, table_name="Financials") assert isinstance(digest, str) - assert f'{parse.shape[1]} column(s)' in digest - assert f'{parse.shape[0]} row(s)' in digest - assert 'Last Price Discovery: 03/01/2022' in digest - assert 'Interest Expense' in digest - assert 'float' in digest + assert f"{parse.shape[1]} column(s)" in digest + assert f"{parse.shape[0]} row(s)" in digest + assert "Last Price Discovery: 03/01/2022" in digest + assert "Interest Expense" in digest + assert "float" in digest def test_html_to_df_and_serialized_data(xlsx): diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 12e6153..3626afd 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -1,95 +1,95 @@ # -*- coding: utf-8 -*- -''' +""" unit tests for eparse interfaces -''' +""" import pandas as pd from peewee import SqliteDatabase from eparse.interfaces import ( DATABASE, - ExcelParse, BaseInterface, + ExcelParse, + HtmlInterface, NullInterface, - StdoutInterface, Sqlite3Interface, - HtmlInterface, + StdoutInterface, i_factory, ) def test_sqlite3_db(sqlite3_db): assert isinstance(sqlite3_db, SqliteDatabase) - assert DATABASE.table_exists('excelparse') + assert DATABASE.table_exists("excelparse") assert len(ExcelParse.select()) == 1 def test_ExcelParse_model(sqlite3_db): assert ExcelParse.get_queryset().shape == (1, 12) - assert ExcelParse.get_column('c_header').shape == (1, 4) + assert ExcelParse.get_column("c_header").shape == (1, 4) def test_null_interface(): - obj = i_factory('null:///') + obj = i_factory("null:///") assert isinstance(obj, NullInterface) assert obj.input().empty assert obj.output(True) is None def test_stdout_interface(): - obj = i_factory('stdout:///') + obj = i_factory("stdout:///") assert isinstance(obj, StdoutInterface) assert obj.input().empty - 
assert obj.output({'foo': 1}) == None + assert obj.output({"foo": 1}) is None def test_sqlite3_interface(data, ctx): - obj = i_factory('sqlite3:///:memory:', ExcelParse) + obj = i_factory("sqlite3:///:memory:", ExcelParse) obj.output([], ctx) obj.output([data], ctx) assert isinstance(obj, Sqlite3Interface) - assert DATABASE.table_exists('excelparse') + assert DATABASE.table_exists("excelparse") assert len(ExcelParse.select()) == 1 def test_html_interface(data, ctx): - html = pd.DataFrame.from_records([data]).to_html() - obj = i_factory('html:///:memory:', ExcelParse) + obj = i_factory("html:///:memory:", ExcelParse) obj.output([], ctx) obj.output([data], ctx) assert isinstance(obj, HtmlInterface) assert isinstance(obj, Sqlite3Interface) - assert DATABASE.table_exists('excelparse') + assert DATABASE.table_exists("excelparse") assert len(ExcelParse.select()) == 1 def test_parse_uri(): - db = 'endpoint://user:password@host:port/name' + db = "endpoint://user:password@host:port/name" p = BaseInterface.parse_uri(db) - keys = ('endpoint', 'user', 'password', 'host', 'port', 'name') + keys = ("endpoint", "user", "password", "host", "port", "name") not_keys = () assert all([k in p.keys() for k in (keys + not_keys)]) assert all([k in p.values() for k in keys]) assert all([k not in p.values() for k in not_keys]) - db = 'endpoint://user@host/name' + db = "endpoint://user@host/name" p = BaseInterface.parse_uri(db) - keys = ('endpoint', 'user', 'host', 'name') - not_keys = ('password', 'port') + keys = ("endpoint", "user", "host", "name") + not_keys = ("password", "port") assert all([k in p.keys() for k in (keys + not_keys)]) assert all([k in p.values() for k in keys]) assert all([k not in p.values() for k in not_keys]) - db = 'endpoint:///name' + db = "endpoint:///name" p = BaseInterface.parse_uri(db) - keys = ('endpoint', 'name') - not_keys = ('user', 'password', 'host', 'port') + keys = ("endpoint", "name") + not_keys = ("user", "password", "host", "port") assert all([k in p.keys() for k in (keys + not_keys)]) assert all([k in p.values() for k in keys]) assert all([k not in p.values() for k in not_keys]) - db = 'endpoint:///' + db = "endpoint:///" p = BaseInterface.parse_uri(db) - keys = ('endpoint',) - not_keys = ('user', 'password', 'host', 'port', 'name') + keys = ("endpoint",) + not_keys = ("user", "password", "host", "port", "name") assert all([k in p.keys() for k in (keys + not_keys)]) assert all([k in p.values() for k in keys]) assert all([k not in p.values() for k in not_keys])
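A closing note on the interface plumbing exercised above: URIs are parsed by `BaseInterface.parse_uri` into named groups and dispatched to an interface class by `i_factory`. A minimal sketch, with invented connection details for illustration:

.. code-block:: python

    from eparse.interfaces import BaseInterface, i_factory

    # parse_uri returns a dict of endpoint/user/password/host/port/name strings
    p = BaseInterface.parse_uri("postgres://user:secret@localhost:5432/eparse")
    assert p["endpoint"] == "postgres"
    assert p["host"] == "localhost"
    assert p["port"] == "5432"

    # i_factory dispatches on the URI scheme; unrecognized endpoints raise ValueError
    iface = i_factory("stdout:///")
    print(type(iface).__name__)  # StdoutInterface

Since `html:///` URIs are rewritten to `sqlite3:///` before dispatch, `test_html_interface` can assert that the returned object is both an `HtmlInterface` and a `Sqlite3Interface`.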