|
15 | 15 | require 'optparse'
|
16 | 16 | require 'uri'
|
17 | 17 | require 'csv'
|
| 18 | +require 'filemagic' |
| 19 | +require 'nokogiri' |
18 | 20 |
|
19 | 21 | # ------------------------------------------- Modules, functions ----------------------------------------
|
20 | 22 | # text extraction, directory recursion, file matching
|
@@ -42,6 +44,11 @@ def extractTextFromPDF(filename, options)
|
42 | 44 | text
|
43 | 45 | end
|
44 | 46 |
|
# Extract text from the specified HTML file.
#
# filename - path of the HTML file to read.
#
# Returns whatever Nokogiri's `text` yields for the parsed document.
def extractTextFromHTML(filename)
  # File.read opens, reads and closes the file in one call, so no file
  # handle is left open while a large directory tree is being processed.
  Nokogiri::HTML(File.read(filename)).text
end
| 51 | + |
45 | 52 | # OCR a specific file.
|
46 | 53 | # Requires a tmp path to where the output file will be written (won't be deleted after use)
|
47 | 54 | # More or less just a tesseract call, but we turn on orientation detection.
|
@@ -98,15 +105,15 @@ def extractTextTika(filename)
|
# Extract text from the specified file.
# The extraction method depends on the MIME type FileMagic reports for the
# file's content:
#   application/pdf -> extractTextFromPDF
#   text/html       -> extractTextFromHTML
#   text/plain      -> the file's raw content
#
# filename - path of the file to extract text from.
# options  - passed through unchanged to extractTextFromPDF.
#
# Returns the extracted text, or false when the MIME type is not one of the
# supported formats (so the caller can skip the file).
def extractTextFromFile(filename, options)
  mime = FileMagic.mime.file(filename)
  # start_with? rather than ==, so a MIME string carrying extra detail after
  # the type still matches.
  if mime.start_with?("application/pdf")
    extractTextFromPDF(filename, options)
  elsif mime.start_with?("text/html")
    extractTextFromHTML(filename)
  elsif mime.start_with?("text/plain")
    # File.read closes the handle itself; File.open(...).read left it open.
    File.read(filename)
  else
    false
  end
end
|
112 | 119 |
|
@@ -163,12 +170,17 @@ def processFile(filename, options)
|
163 | 170 | # - title, the filename (relative)
|
164 | 171 | # - url, an http://localhost:8000 URL to the relative path
|
165 | 172 | if options.process
|
166 |
| - text = cleanText(extractTextFromFile(filename, options)) |
167 |
| - title = filename |
168 |
| - url = "http://localhost:8000/" + filename |
169 |
| - uid = Digest::MD5.hexdigest(filename) |
170 |
| - |
171 |
| - options.csv << [uid, text, title, url] |
| 173 | + text = extractTextFromFile(filename, options) |
| 174 | + |
| 175 | + if text |
| 176 | + uid = Digest::MD5.hexdigest(filename) |
| 177 | + text = cleanText(text) |
| 178 | + title = filename |
| 179 | + url = "http://localhost:8000/" + filename |
| 180 | + options.csv << [uid, text, title, url] |
| 181 | + else |
| 182 | + STDERR.write "Skipping #{filename}\n" |
| 183 | + end |
172 | 184 | end
|
173 | 185 | rescue => error
|
174 | 186 | STDERR.write "Error processing #{filename}, skipping.\n"
|
|
0 commit comments