Skip to content

Commit

Permalink
Merge pull request #21 from final-year-project/enhance/validate-pdfs-…
Browse files Browse the repository at this point in the history
…using-ghostscript

Remove PDFtk for aggregating and compressing PDFs
  • Loading branch information
macite authored Jul 5, 2016
2 parents b4781b2 + 7b9cb00 commit b623726
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 67 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ RUN apt-get install -y \
libpq-dev imagemagick \
libmagickwand-dev \
libmagic-dev \
pdftk \
libpq-dev \
python-pygments
python-pygments \
ghostscript

ADD . /doubtfire-api
WORKDIR /doubtfire-api
Expand Down
29 changes: 12 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,12 @@ CREATE ROLE itig WITH CREATEDB PASSWORD 'd872$dh' LOGIN;

#### 4. Install native tools

Install `imagemagick` and `libmagic` using Homebrew:
Install `imagemagick`, `libmagic` and `ghostscript` using Homebrew:

```
$ brew install imagemagick libmagic
$ brew install imagemagick libmagic ghostscript
```

You also need to install PDFtk manually; download the installer [here](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk_server-2.02-mac_osx-10.6-setup.pkg).

You will also need to install the Python `pygments` package:

```
Expand Down Expand Up @@ -271,7 +269,9 @@ $ rbenv install 2.0.0-p353
Install [Postgres](http://www.postgresql.org/download/linux/):

```
$ sudo apt-get install postgresql postgresql-contrib libpq-dev
$ sudo apt-get install postgresql \
postgresql-contrib \
libpq-dev
```

Ensure `pg_config` is on the `PATH`, and then log in to Postgres. You will need to locate where `apt-get` has installed your Postgres binary and add this to your `PATH`. You can use `whereis psql` for that, but ensure you add the directory, not the executable, to the path.
Expand All @@ -295,18 +295,15 @@ CREATE ROLE itig WITH CREATEDB PASSWORD 'd872$dh' LOGIN;

#### 4. Install native tools

Install `imagemagick`, `libmagic` and `pdftk`:

```
$ sudo apt-get install imagemagick libmagickwand-dev
$ sudo apt-get install libmagic-dev
$ sudo apt-get install pdftk
```

You will also need to install the Python `pygments` package:
Install `imagemagick`, `libmagic` and `ghostscript`. You will also need to
install the Python `pygments` package:

```
$ sudo apt-get install python-pygments
$ sudo apt-get install ghostscript \
imagemagick \
libmagickwand-dev \
libmagic-dev \
python-pygments
```

#### 5. Install Doubtfire API dependencies
Expand Down Expand Up @@ -572,8 +569,6 @@ After installing LaTeX, you must ensure the following are listed on the `PATH`:
```
$ which convert
/usr/local/bin/convert
$ which pdftk
/usr/local/bin/pdftk
$ which pygmentize
/usr/local/bin/pygmentize
$ which pdflatex
Expand Down
71 changes: 27 additions & 44 deletions app/helpers/file_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,12 @@ def compress_image(path)
logger.debug "File helper has started compressing #{path} to #{tmp_file}..."

begin
exec = "#{Rails.root.join('lib', 'shell', 'timeout.sh')} -t 30 nice -n 10 convert \"#{path}\" -resize 1024x1024 \"#{tmp_file}\" >>/dev/null 2>>/dev/null"
# puts exec
exec = "convert \
\"#{path}\" \
-resize 1024x1024 \
\"#{tmp_file}\" >>/dev/null 2>>/dev/null"

# try with convert
did_compress = false
try_within 40, "compressing image" do
did_compress = system exec
end
did_compress = system_try_within 40, "compressing image using convert", exec

if did_compress
FileUtils.mv tmp_file, path
Expand All @@ -202,7 +200,6 @@ def compress_image(path)
end
end

# puts "#{did_compress}"
raise "Failed to compress an image. Ensure all images are smaller than 1MB." unless did_compress
return true
end
Expand All @@ -217,20 +214,30 @@ def compress_pdf(path, max_size = 2500000)
tmp_file = File.join( Dir.tmpdir, 'doubtfire', 'compress', "#{File.dirname(path).split(File::Separator).last}-file.pdf" )
FileUtils.mkdir_p(File.join( Dir.tmpdir, 'doubtfire', 'compress' ))

exec = "#{Rails.root.join('lib', 'shell', 'timeout.sh')} -t 30 nice -n 10 gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.3 -dDetectDuplicateImages=true -dPDFSETTINGS=/screen -dNOPAUSE -dBATCH -dQUIET -sOutputFile=\"#{tmp_file}\" \"#{path}\" >>/dev/null 2>>/dev/null"
exec = "gs -sDEVICE=pdfwrite \
-dCompatibilityLevel=1.3 \
-dDetectDuplicateImages=true \
-dPDFSETTINGS=/screen \
-dNOPAUSE \
-dBATCH \
-dQUIET \
-sOutputFile=\"#{tmp_file}\" \
\"#{path}\" \
>>/dev/null 2>>/dev/null"

# try with ghostscript
did_compress = system exec
did_compress = system_try_within 30, "compressing PDF using ghostscript", exec

if !did_compress
logger.info "Failed to compress PDF #{path} using GhostScript. Trying with convert"

exec = "#{Rails.root.join('lib', 'shell', 'timeout.sh')} -t 30 nice -n 10 convert \"#{path}\" -compress Zip \"#{tmp_file}\" >>/dev/null 2>>/dev/null"
exec = "convert \"#{path}\" \
-compress Zip \
\"#{tmp_file}\" \
>>/dev/null 2>>/dev/null"

# try with convert
try_within 40, "compressing PDF" do
did_compress = system exec
end
did_compress = system_try_within 40, "compressing PDF using convert", exec

if !did_compress
logger.error "Failed to compress PDF #{path} using convert. Cannot compress this PDF. Command was:\n\t#{exec}"
Expand Down Expand Up @@ -339,17 +346,13 @@ def convert_files_to_pdf(from_path, dest_path)
#
# Tests if a PDF is valid / corrupt
#
def pdf_valid?(file)
did_succeed = false

try_within 30, "validating PDF" do
did_succeed = system "nice -n 10 pdftk #{file} output /dev/null dont_ask"
unless did_succeed
logger.error "Failed to validate PDF file. Is pdftk installed?"
end
# Checks whether the file at +filename+ looks like a complete PDF by
# searching its trailing bytes (at most the last 1024) for the '%%EOF' mark.
#
# Only the tail of the file is inspected; the document body is not parsed.
#
# - filename: path of the file to check
# - returns true when the mark is found, false when the path is not a
#   regular file or the mark is absent
def pdf_valid?(filename)
  # File.file? also rejects directories, which cannot be read as a PDF
  return false unless File.file?(filename)

  File.open(filename, 'rb') do |f|
    # Clamp the window to the file size: seeking further back than the
    # start of the file raises Errno::EINVAL for files under 1024 bytes.
    tail_length = [f.size, 1024].min
    f.seek(-tail_length, IO::SEEK_END)
    f.read.include?('%%EOF')
  end
end

#
Expand Down Expand Up @@ -484,25 +487,6 @@ def read_file_to_str(filename)
result
end

#
# Combine a list of PDF files into one PDF by shelling out to pdftk
# - pdf_paths: input PDF paths, joined with spaces into the command line
# - final_pdf_path: path the combined PDF is written to
# - returns the result of the system call (truthy when pdftk succeeded)
#
# NOTE(review): pdf_paths are interpolated without quoting while
# final_pdf_path is single-quoted -- confirm callers pass shell-safe paths.
#
def aggregate(pdf_paths, final_pdf_path)
  logger.debug "Trying to aggregate PDFs to #{final_pdf_path}"

  succeeded = false
  command = "nice -n 10 pdftk #{pdf_paths.join ' '} cat output '#{final_pdf_path}' dont_ask compress"

  # presumably Terminator aborts the block after 180 seconds -- TODO confirm
  Terminator.terminate(180) { succeeded = system(command) }

  logger.error "Failed to aggregate PDFs to #{final_pdf_path}. Command was:\n\t#{command}" unless succeeded
  succeeded
end

def path_to_plagarism_html(match_link)
to_dir = student_work_dir(:plagarism, match_link.task)

Expand Down Expand Up @@ -629,7 +613,6 @@ def extract_file_from_done(task, to_path, pattern, name_fn)
module_function :doc_to_pdf
module_function :cover_to_pdf
module_function :read_file_to_str
module_function :aggregate
module_function :path_to_plagarism_html
module_function :save_plagiarism_html
module_function :delete_plagarism_html
Expand Down
17 changes: 17 additions & 0 deletions app/helpers/timeout_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ def try_within(sec, timeout_message = "operation", &block)
end
end

#
# Run a shell command with a time limit
#
# The command is prefixed with lib/shell/timeout.sh (given the same limit via
# -t) and `nice -n 10`, and the system call itself is wrapped in try_within.
#
# - sec: time limit passed to both timeout.sh and try_within
# - timeout_message: description handed to try_within
# - command: shell command line to run
# - returns the result of the system call, or false if it was never assigned
#
# Usage:
# system_try_within 30, "doing the thing", "gs do.pdf the.pdf thing.pdf"
#
def system_try_within(sec, timeout_message, command)
  # shell script to kill command after timeout
  watchdog = Rails.root.join('lib', 'shell', 'timeout.sh')

  outcome = false
  try_within(sec, timeout_message) do
    outcome = system("#{watchdog} -t #{sec} nice -n 10 #{command}")
  end
  outcome
end

# Export functions as module functions
module_function :try_within
module_function :system_try_within
end
4 changes: 0 additions & 4 deletions app/models/project.rb
Original file line number Diff line number Diff line change
Expand Up @@ -953,8 +953,6 @@ def create_portfolio
fout.puts pdf_text
end

# FileHelper.compress_pdf(self.portfolio_path)

logger.info "Created portfolio at #{portfolio_path} - #{log_details()}"

self.portfolio_production_date = DateTime.now
Expand All @@ -964,9 +962,7 @@ def create_portfolio
logger.error "Failed to convert portfolio to PDF - #{log_details()} -\nError: #{e.message}"

log_file = e.message.scan(/\/.*\.log/).first
# puts "log file is ... #{log_file}"
if log_file && File.exists?(log_file)
# puts "exists"
begin
puts "--- Latex Log ---\n"
puts File.read(log_file)
Expand Down

0 comments on commit b623726

Please sign in to comment.