diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index a562b32..5ce7a31 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,28 +9,27 @@ on: jobs: build-n-publish: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - - name: Install poetry - run: | - python -m pip install poetry - - - name: Build the package - run: | - poetry build - - - name: Publish package - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - uses: actions/checkout@v2 + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install poetry + run: | + python -m pip install poetry + + - name: Build the package + run: | + poetry build + + - name: Publish package + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index a5736ab..18a73e0 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -2,86 +2,84 @@ name: Run tox tests on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: pytest: - runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [[3,7], [3,8], [3,9]] + python-version: [[3, 7], [3, 8], [3, 9]] os: [ubuntu-latest, macos-latest, windows-latest] steps: - - uses: actions/checkout@v2 - - - name: Set up Python ${{ join(matrix.python-version, '.') }} - uses: actions/setup-python@v2 - with: - python-version: ${{ join(matrix.python-version, '.') }} - - - name: Copy policy file for ubuntu-latest (needed to use ImageMagic in visual debugging tox tests) - run: | - sudo cp .github/workflows/policy.xml /etc/ImageMagick-6/policy.xml - if: matrix.os == 'ubuntu-latest' - - - name: Install ghostscript - run: | - sudo apt install ghostscript - if: matrix.os == 'ubuntu-latest' - - - name: Install ImageMagick on macos-latest - run: | - brew install freetype imagemagick - if: matrix.os == 'macos-latest' - - - name: Install poetry and tox - run: | - python -m pip install poetry tox - - - name: Run tox env pytest on Linux, macOS - run: | - tox -e py${{ join(matrix.python-version, '') }} - if: runner.os == 'Linux' || runner.os == 'macOS' - - # set the shell for Windows so env var expansion works in tox and subprocesses - - name: Run tox env pytest on Windows - run: | - tox -e py${{ join(matrix.python-version, '') }} - shell: cmd - if: runner.os == 'Windows' + - uses: actions/checkout@v2 + + - name: Set up Python ${{ join(matrix.python-version, '.') }} + uses: actions/setup-python@v2 + with: + python-version: ${{ join(matrix.python-version, '.') }} + + - name: Copy policy file for ubuntu-latest (needed to use ImageMagic in visual debugging tox tests) + run: | + sudo cp .github/workflows/policy.xml /etc/ImageMagick-6/policy.xml + if: matrix.os == 'ubuntu-latest' + + - name: Install ghostscript + run: | + sudo apt install ghostscript + if: matrix.os == 'ubuntu-latest' + + - name: Install ImageMagick on macos-latest + run: | + brew install freetype imagemagick + if: matrix.os == 'macos-latest' + + - name: Install poetry and tox + run: | + python -m pip install poetry tox + + - name: Run tox env pytest on Linux, macOS + run: | + tox -e py${{ join(matrix.python-version, '') }} + if: runner.os == 'Linux' || runner.os == 'macOS' + + # set the shell for Windows so env var expansion works in tox and subprocesses + - name: Run tox env pytest on Windows + run: | + tox -e py${{ join(matrix.python-version, '') }} + shell: cmd + if: runner.os == 'Windows' flake8_pylint_docs_black: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 - - name: Install poetry and tox - run: | - python -m pip install poetry tox + - name: Install poetry and tox + run: | + python -m pip install poetry tox - - name: Run tox env flake8 - run: | - tox -e flake8 + - name: Run tox env flake8 + run: | + tox -e flake8 - - name: Run tox env pylint - run: | - tox -e pylint + - name: Run tox env pylint + run: | + tox -e pylint - - name: Run tox env docs - run: | - tox -e docs + - name: Run tox env docs + run: | + tox -e docs - - name: Run tox env black - run: | - tox -e black + - name: Run tox env black + run: | + tox -e black diff --git a/README.rst b/README.rst index 1a9a9ff..ad3f1e9 100644 --- a/README.rst +++ b/README.rst @@ -4,4 +4,3 @@ Introduction ============ ``libpdf`` allows the extraction of structured data from machine readable PDFs. - diff --git a/libpdf/tables.py b/libpdf/tables.py index b974666..52fafea 100644 --- a/libpdf/tables.py +++ b/libpdf/tables.py @@ -79,7 +79,7 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): table_dict = {'page': {}} table_list = [] - + table_id = 1 for idx_page, page in enumerate( tqdm(pdf.pages, desc='###### Extracting tables', unit='pages', bar_format=bar_format_lvl2()), ): @@ -88,7 +88,6 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): if len((page.find_tables(table_settings))) != 0: table_dict['page'].update({idx_page + 1: []}) tables = page.find_tables(table_settings) - counter = 1 lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage for table in tables: # bbox in tables use pdfplumber bbox coordination (x0, top, y0, bottom), hence, need to @@ -112,7 +111,7 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): if _table_figure_check(table_pos, figure_list) is True: table_dict['page'][idx_page + 1].append( { - 'id': 'table.' + str(counter), + 'id': 'table.' + str(table_id), 'type': 'table', 'positions': table_pos, # 'text': table_temp.extract(2, 2), @@ -123,16 +122,16 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): cells = extract_cells( lt_page, table.rows, - table_dict['page'][idx_page + 1][counter - 1]['cell'], + table_dict['page'][idx_page + 1][len(table_dict['page'][idx_page + 1]) - 1]['cell'], pages_list[idx_page], ) - table = Table(idx=counter, cells=cells, position=table_pos) + table = Table(idx=table_id, cells=cells, position=table_pos) table_list.append(table) - counter += 1 + table_id += 1 - if counter == 1: # no table is added + if len(table_dict['page'][idx_page + 1]) == 0: # no table is added del table_dict['page'][idx_page + 1] return table_list