Don't use Tika for now; just extract text from HTML (via Nokogiri), PDF, and plain-text files.

davidmcclure · davidmcclure · commit 3f25597839c2 · 2014-12-29T15:47:58.000-06:00
diff --git a/Gemfile b/Gemfile
@@ -1,2 +1,3 @@
 source "https://rubygems.org"
-gem 'ruby-filemagic', '0.6.1'
+gem 'nokogiri'
+gem 'ruby-filemagic'
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -1,10 +1,16 @@
 GEM
   remote: https://rubygems.org/
   specs:
+    headless (1.0.2)
+    mini_portile (0.5.3)
+    nokogiri (1.6.1)
+      mini_portile (~> 0.5.0)
     ruby-filemagic (0.6.1)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  ruby-filemagic (= 0.6.1)
+  headless
+  nokogiri
+  ruby-filemagic
diff --git a/docs2csv.rb b/docs2csv.rb
@@ -15,6 +15,8 @@
 require 'optparse'
 require 'uri'
 require 'csv'
+require 'filemagic'
+require 'nokogiri'
 
 # ------------------------------------------- Modules, functions ----------------------------------------
 # text extraction, directory recursion, file matching
@@ -42,6 +44,11 @@ def extractTextFromPDF(filename, options)
   text
 end
 
+# Extract text from the specified HTML file.
+def extractTextFromHTML(filename)
+  Nokogiri::HTML(File.open(filename).read).text
+end
+
 # OCR a specific file.
 # Requires a tmp path to where the output file will be written (won't be deleted after use)
 # More or less just a tesseract call, but we turn on orientation detection.
@@ -98,15 +105,15 @@ def extractTextTika(filename)
 # extract text from specified file
 # Format dependent
 def extractTextFromFile(filename, options)
-  format = File.extname(filename)
-  if format == ".pdf"
+  mime = FileMagic.mime.file(filename)
+  if mime.start_with?("application/pdf")
     extractTextFromPDF(filename, options)
-  elsif format == ".jpg"
-    ocrImage(filename, options)
-  elsif format == ".txt"
+  elsif mime.start_with?("text/html")
+    extractTextFromHTML(filename)
+  elsif mime.start_with?("text/plain")
     File.open(filename).read
   else
-    extractTextTika(filename)
+    false
   end
 end
 
@@ -163,12 +170,17 @@ def processFile(filename, options)
     # - title, the filename (relative)
     # - url, an http://localhost:8000 URL to the relative path
     if options.process
-      text = cleanText(extractTextFromFile(filename, options))
-      title = filename
-      url = "http://localhost:8000/" + filename
-      uid = Digest::MD5.hexdigest(filename)
-
-      options.csv << [uid, text, title, url]
+      text = extractTextFromFile(filename, options)
+
+      if text
+        uid = Digest::MD5.hexdigest(filename)
+        text = cleanText(text)
+        title = filename
+        url = "http://localhost:8000/" + filename
+        options.csv << [uid, text, title, url]
+      else
+        STDERR.write "Skipping #{filename}\n"
+      end
     end
   rescue => error
     STDERR.write "Error processing #{filename}, skipping.\n"