package working again
pachadotdev committed Apr 11, 2024
1 parent bfc79cb commit 581cad8
Showing 38 changed files with 705 additions and 368 deletions.
12 changes: 6 additions & 6 deletions CONTRIBUTING.md
@@ -1,26 +1,26 @@
Contributions to **tabulizer** are welcome from anyone and are best sent as pull requests on [the GitHub repository](https://github.com/leeper/tabulizer/). This page provides some instructions to potential contributors about how to add to the package.
Contributions to **tabulapdf** are welcome from anyone and are best sent as pull requests on [the GitHub repository](https://github.com/leeper/tabulapdf/). This page provides some instructions to potential contributors about how to add to the package.

1. Contributions can be submitted as [a pull request](https://help.github.com/articles/creating-a-pull-request/) on GitHub by forking or cloning the [repo](https://github.com/leeper/tabulizer/), making changes and submitting the pull request.
1. Contributions can be submitted as [a pull request](https://help.github.com/articles/creating-a-pull-request/) on GitHub by forking or cloning the [repo](https://github.com/leeper/tabulapdf/), making changes and submitting the pull request.

2. Pull requests should involve only one commit per substantive change. This means if you change multiple files (e.g., code and documentation), these changes should be committed together. If you don't know how to do this (e.g., you are making changes in the GitHub web interface) just submit anyway and the maintainer will clean things up.

3. All contributions must be submitted consistent with the package license ([MIT](https://opensource.org/licenses/MIT)).

4. Non-trivial contributions need to be noted in the `Authors@R` field in the [DESCRIPTION](https://github.com/leeper/tabulizer/blob/master/DESCRIPTION). Just follow the format of the existing entries to add your name (and, optionally, email address). Substantial contributions should also be noted in [`inst/CITATION`](https://github.com/leeper/tabulizer/blob/master/inst/CITATION).
4. Non-trivial contributions need to be noted in the `Authors@R` field in the [DESCRIPTION](https://github.com/leeper/tabulapdf/blob/master/DESCRIPTION). Just follow the format of the existing entries to add your name (and, optionally, email address). Substantial contributions should also be noted in [`inst/CITATION`](https://github.com/leeper/tabulapdf/blob/master/inst/CITATION).

5. The project uses roxygen code and documentation markup, so changes should be made to the roxygen comments in the source code `.R` files. If changes are made, roxygen needs to be run. The easiest way to do this is a command line call to `Rscript -e 'devtools::document()'`. Please resolve any roxygen errors before submitting a pull request. (A minimal roxygen sketch follows this list.)

6. Please run `R CMD build tabulizer` and `R CMD check tabulizer_VERSION.tar.gz` before submitting the pull request to check for any errors.
6. Please run `R CMD build tabulapdf` and `R CMD check tabulapdf_VERSION.tar.gz` before submitting the pull request to check for any errors.
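As a rough illustration of the roxygen workflow in item 5, here is a minimal sketch of a documented function; the function name `count_tables()` and its body are hypothetical, not part of the package:

```r
#' Count the tables detected in a PDF (hypothetical helper for illustration)
#'
#' @param file Path to a PDF file.
#' @return An integer: the number of tables found in the file.
#' @examples
#' \dontrun{
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#' count_tables(f)
#' }
#' @export
count_tables <- function(file) {
  length(extract_tables(file))
}
```

Running `Rscript -e 'devtools::document()'` then regenerates the corresponding `.Rd` file and the `NAMESPACE` entry.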

Some specific types of changes that you might make are:

1. Bug fixes. Great!

2. Documentation-only changes (e.g., to Rd files, README, vignettes). This is great! All contributions are welcome.

3. New functionality. This is fine, but should be discussed on [the GitHub issues page](https://github.com/leeper/tabulizer/issues) before submitting a pull request.
3. New functionality. This is fine, but should be discussed on [the GitHub issues page](https://github.com/leeper/tabulapdf/issues) before submitting a pull request.

4. Changes requiring a new package dependency should also be discussed on [the GitHub issues page](https://github.com/leeper/tabulizer/issues) before submitting a pull request.
4. Changes requiring a new package dependency should also be discussed on [the GitHub issues page](https://github.com/leeper/tabulapdf/issues) before submitting a pull request.

5. Message translations. These are very appreciated! The format is a pain, but if you're doing this I'm assuming you're already familiar with it.

32 changes: 16 additions & 16 deletions DESCRIPTION
@@ -1,38 +1,36 @@
Package: tabulizer
Package: tabulapdf
Type: Package
Title: Bindings for 'Tabula' PDF Table Extractor Library
Version: 0.2.3
Title: Extract Tables from PDF Documents
Version: 1.0.5
Authors@R: c(person("Thomas J.", "Leeper",
role = "aut",
email = "thosjleeper@gmail.com",
comment = c(ORCID = "0000-0003-4097-6326")),
person("Mauricio", "Vargas",
person("Mauricio", "Vargas Sepulveda",
role = c("aut","cre"),
email = "hello+r@pacha.dev",
email = "m.sepulveda@mail.utoronto.ca",
comment = c(ORCID = "0000-0003-1017-7574")),
person("Tom", "Paskhalis",
role = "aut",
email = "tpaskhalis@gmail.com",
comment = c(ORCID = "0000-0001-9298-8850")),
person("Manuel", "Aristaran",
role = "ctb"),
person("David", "Gohel",
role = "ctb",
comment = "rOpenSci reviewer"),
person("Lincoln", "Mullen",
role = "ctb",
comment = "rOpenSci reviewer"))
Description: Bindings for the 'Tabula' <http://tabula.technology/> 'Java'
library, which can extract tables from PDF documents. The 'tabulizerjars'
package <https://github.com/ropensci/tabulizerjars> provides versioned
'Java' .jar files, including all dependencies, aligned to releases of
'Tabula'.
library, which can extract tables from PDF documents.
License: MIT + file LICENSE
URL: https://docs.ropensci.org/tabulizer (website)
https://github.com/ropensci/tabulizer
BugReports: https://github.com/ropensci/tabulizer/issues
URL: https://docs.ropensci.org/tabulapdf (website)
https://github.com/ropensci/tabulapdf
BugReports: https://github.com/ropensci/tabulapdf/issues
Imports:
png,
rJava,
tabulizerjars,
tools,
utils
Suggests:
@@ -44,8 +42,10 @@ Suggests:
testthat,
rmarkdown,
covr
Remotes: ropensci/tabulizerjars
SystemRequirements: Java (>= 7.0)
SystemRequirements: Java (>= 7.0):
openjdk-11-jdk (deb),
java-11-openjdk.x86_64 (rpm),
openjdk@11 (brew)
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.2.1
RoxygenNote: 7.3.1
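Because the package runs Tabula through rJava, the `SystemRequirements` entry above matters at install time. A minimal sketch (assuming rJava is already installed and configured) to check which Java runtime rJava actually starts:

```r
library(rJava)

.jinit()  # start the JVM that rJava will use
# ask the JVM for its version string; it should report Java 7 or later
J("java.lang.System")$getProperty("java.version")
```

If this reports an older runtime, reconfiguring with `R CMD javareconf` after installing one of the JDK packages listed above is the usual fix.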
1 change: 0 additions & 1 deletion NAMESPACE
@@ -11,7 +11,6 @@ export(make_thumbnails)
export(merge_pdfs)
export(split_pdf)
export(stop_logging)
import(tabulizerjars)
importFrom(grDevices,dev.capabilities)
importFrom(grDevices,dev.off)
importFrom(graphics,locator)
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
# CHANGES TO tabulapdf 1.0.5

* Package renamed to `tabulapdf`
* New maintainer: @pachadotdev
* Updated to use tabula-java 1.0.5
* Updated the methods in `extract_tables()`
* The version now follows the version of tabula-java

# CHANGES TO tabulizer 0.2.2

* `extract_tables()` gets `outdir` argument for writing out CSV, TSV and JSON
28 changes: 15 additions & 13 deletions R/extract_metadata.R
@@ -11,8 +11,8 @@
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "data.pdf", package = "tabulizer")
#'
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#'
#' extract_metadata(f)
#' }
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{extract_text}}, \code{\link{split_pdf}}
@@ -21,16 +21,18 @@
extract_metadata <- function(file, password = NULL, copy = FALSE) {
pdfDocument <- load_doc(file, password = password, copy = copy)
on.exit(pdfDocument$close())

info <- pdfDocument$getDocumentInformation()
list(pages = pdfDocument$getNumberOfPages(),
title = info$getTitle(),
author = info$getAuthor(),
subject = info$getSubject(),
keywords = info$getKeywords(),
creator = info$getCreator(),
producer = info$getProducer(),
created = info$getCreationDate()$getTime()$toString(),
modified = info$getModificationDate()$getTime()$toString(),
trapped = info$getTrapped())
list(
pages = pdfDocument$getNumberOfPages(),
title = info$getTitle(),
author = info$getAuthor(),
subject = info$getSubject(),
keywords = info$getKeywords(),
creator = info$getCreator(),
producer = info$getProducer(),
created = info$getCreationDate()$getTime()$toString(),
modified = info$getModificationDate()$getTime()$toString(),
trapped = info$getTrapped()
)
}
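For reference, a usage sketch matching the roxygen example above; the returned list fields mirror the rewritten function body, and the values depend on the PDF's own metadata:

```r
library(tabulapdf)

f <- system.file("examples", "data.pdf", package = "tabulapdf")

meta <- extract_metadata(f)
meta$pages  # number of pages in the document
meta$title  # document title; empty/NULL if the PDF does not set one
```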
169 changes: 85 additions & 84 deletions R/extract_tables.R
@@ -35,7 +35,7 @@
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "data.pdf", package = "tabulizer")
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#'
#' # extract all tables
#' extract_tables(f)
@@ -53,7 +53,6 @@
#' extract_tables(f, pages = 2, output = "data.frame")
#' }
#' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
#' @import tabulizerjars
#' @importFrom utils read.delim download.file
#' @importFrom tools file_path_sans_ext
#' @importFrom rJava J new .jfloat .jcall
@@ -64,103 +63,105 @@ extract_tables <- function(file,
columns = NULL,
guess = TRUE,
method = c("decide", "lattice", "stream"),
output = c("matrix", "data.frame", "character",
"asis", "csv", "tsv", "json"),
output = c(
"matrix", "data.frame", "character",
"asis", "csv", "tsv", "json"
),
outdir = NULL,
password = NULL,
encoding = NULL,
copy = FALSE,
...) {
method <- match.arg(method)
output <- match.arg(output)
method <- match.arg(method)
output <- match.arg(output)

if (isTRUE(guess) && (!is.null(area) || !is.null(columns))) warning("Argument guess is TRUE: arguments area and columns are ignored.")

if (is.null(outdir)) {
outdir <- normalizePath(tempdir())
} else {
outdir <- normalizePath(outdir)
}
if (isTRUE(guess) && (!is.null(area) || !is.null(columns))) warning("Argument guess is TRUE: arguments area and columns are ignored.")

pdfDocument <- load_doc(file, password = password, copy = copy)
on.exit(pdfDocument$close())
oe <- new(J("technology.tabula.ObjectExtractor"), pdfDocument)
if (is.null(outdir)) {
outdir <- normalizePath(tempdir())
} else {
outdir <- normalizePath(outdir)
}

# parse arguments
if (is.null(pages)) {
pageIterator <- oe$extract()
} else {
pages <- as.integer(pages)
pageIterator <- oe$extract(make_pages(pages))
}
npages <- pdfDocument$getNumberOfPages()
area <- make_area(area = area, pages = pages, npages = npages, target = "tabula")
columns <- make_columns(columns = columns, pages = pages, npages = npages)
pdfDocument <- load_doc(file, password = password, copy = copy)
on.exit(pdfDocument$close())
oe <- new(J("technology.tabula.ObjectExtractor"), pdfDocument)

# setup extractors
basicExtractor <- new(J("technology.tabula.extractors.BasicExtractionAlgorithm"))
spreadsheetExtractor <- new(J("technology.tabula.extractors.SpreadsheetExtractionAlgorithm"))
if (method == "lattice") {
use <- method
}
else if (method == "stream") {
use <- method
}
# parse arguments
if (is.null(pages)) {
pageIterator <- oe$extract()
} else {
pages <- as.integer(pages)
pageIterator <- oe$extract(make_pages(pages))
}
npages <- pdfDocument$getNumberOfPages()
area <- make_area(area = area, pages = pages, npages = npages, target = "tabula")
columns <- make_columns(columns = columns, pages = pages, npages = npages)

tables <- new(J("java.util.ArrayList"))
p <- 1L # page number
while (.jcall(pageIterator, "Z", "hasNext")) {
page <- .jcall(pageIterator, "Ljava/lang/Object;", "next")
# setup extractors
basicExtractor <- new(J("technology.tabula.extractors.BasicExtractionAlgorithm"))
spreadsheetExtractor <- new(J("technology.tabula.extractors.SpreadsheetExtractionAlgorithm"))
if (method == "lattice") {
use <- method
} else if (method == "stream") {
use <- method
}

if (!is.null(area[[p]])) {
page <- page$getArea(area[[p]])
}
tables <- new(J("java.util.ArrayList"))
p <- 1L # page number
while (.jcall(pageIterator, "Z", "hasNext")) {
page <- .jcall(pageIterator, "Ljava/lang/Object;", "next")

# decide whether to use spreadsheet or basic extractor
if (method == "decide") {
tabular <- spreadsheetExtractor$isTabular(page)
if (identical(FALSE, tabular)) {
use <- "stream"
} else {
use <- "lattice"
}
if (!is.null(area[[p]])) {
page <- page$getArea(area[[p]])
}

# decide whether to use spreadsheet or basic extractor
if (method == "decide") {
tabular <- spreadsheetExtractor$isTabular(page)
if (identical(FALSE, tabular)) {
use <- "stream"
} else {
use <- "lattice"
}
}
if (isTRUE(guess) && use == "lattice") {
tables$add(spreadsheetExtractor$extract(page))
} else {
if (isTRUE(guess)) {
# detect table locations
detector <- new(J("technology.tabula.detectors.NurminenDetectionAlgorithm"))
guesses <- detector$detect(page)
guessesIterator <- guesses$iterator()
while (.jcall(guessesIterator, "Z", "hasNext")) {
guessRect <- .jcall(guessesIterator, "Ljava/lang/Object;", "next")
thisGuess <- page$getArea(guessRect)
tables$add(basicExtractor$extract(thisGuess))
rm(thisGuess)
}
if (isTRUE(guess) && use == "lattice") {
tables$add(spreadsheetExtractor$extract(page))
} else {
if (is.null(columns[[p]])) {
tables$add(basicExtractor$extract(page))
} else {
if (isTRUE(guess)) {
# detect table locations
detector <- new(J("technology.tabula.detectors.NurminenDetectionAlgorithm"))
guesses <- detector$detect(page)
guessesIterator <- guesses$iterator()
while (.jcall(guessesIterator, "Z", "hasNext")) {
guessRect <- .jcall(guessesIterator, "Ljava/lang/Object;", "next")
thisGuess <- page$getArea(guessRect)
tables$add(basicExtractor$extract(thisGuess))
rm(thisGuess)
}
} else {
if (is.null(columns[[p]])) {
tables$add(basicExtractor$extract(page))
} else {
tables$add(basicExtractor$extract(page, columns[[p]]))
}
}
tables$add(basicExtractor$extract(page, columns[[p]]))
}

rm(page)
p <- p + 1L # iterate page number
}
}
rm(p)

# return output
switch(tolower(output),
"csv" = write_csvs(tables, file = file, outdir = outdir, ...),
"tsv" = write_tsvs(tables, file = file, outdir = outdir, ...),
"json" = write_jsons(tables, file = file, outdir = outdir, ...),
"character" = list_characters(tables, encoding = encoding, ...),
"matrix" = list_matrices(tables, encoding = encoding, ...),
"data.frame" = list_data_frames(tables, encoding = encoding, ...),
"asis" = tables,
tables)
rm(page)
p <- p + 1L # iterate page number
}
rm(p)

# return output
switch(tolower(output),
"csv" = write_csvs(tables, file = file, outdir = outdir, ...),
"tsv" = write_tsvs(tables, file = file, outdir = outdir, ...),
"json" = write_jsons(tables, file = file, outdir = outdir, ...),
"character" = list_characters(tables, encoding = encoding, ...),
"matrix" = list_matrices(tables, encoding = encoding, ...),
"data.frame" = list_data_frames(tables, encoding = encoding, ...),
"asis" = tables,
tables
)
}
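A short usage sketch consistent with the roxygen examples and the argument handling above; the `area` coordinates are illustrative only (`c(top, left, bottom, right)` in points), and note that `area`/`columns` are ignored with a warning when `guess = TRUE`:

```r
library(tabulapdf)

f <- system.file("examples", "data.pdf", package = "tabulapdf")

# extract all tables, letting the detectors pick the locations and method
extract_tables(f)

# tables from page 2 only, coerced to data frames
extract_tables(f, pages = 2, output = "data.frame")

# explicit area on page 1; guess must be FALSE or the area is ignored
extract_tables(f, pages = 1, guess = FALSE,
               area = list(c(126, 149, 212, 462)))
```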