Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,17 @@ RUN apt-get update && apt-get install -y \
netcat \
rclone

RUN curl https://hathitrust.github.io/debian/hathitrust-archive-keyring.gpg -o /usr/share/keyrings/hathitrust-archive-keyring.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/hathitrust-archive-keyring.gpg] https://hathitrust.github.io/debian/ bullseye main" > /etc/apt/sources.list.d/hathitrust.list

RUN apt-get update && apt-get install -y grokj2k-tools

RUN cpan -f -i Net::AMQP::RabbitMQ

ARG UNAME=ingest
ARG UID=1000
ARG GID=1000
ENV FEED_HOME=/usr/local/feed

RUN groupadd -g $GID -o $UNAME
RUN useradd -m -d $FEED_HOME -u $UID -g $GID -o -s /bin/bash $UNAME

Expand Down
15 changes: 6 additions & 9 deletions bin/compress_tif_jp2.pl
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,14 @@

# try to compress the TIFF -> JPEG2000
print("Compressing $infile to $outfile\n");
my $kdu_compress = get_config('kdu_compress');
die("You must correctly configure the path to kdu_compress in the feed configuration\n")
if not defined $kdu_compress or !-x $kdu_compress;
my $grk_compress = get_config('grk_compress');
die("You must correctly configure the path to grk_compress in the feed configuration\n")
if not defined $grk_compress or !-x $grk_compress;

# Settings for kdu_compress recommended from Roger Espinosa. "-slope"
# is a VBR compression mode; the value of 42988 corresponds to pre-6.4
# slope of 51180, the current (as of 5/6/2011) recommended setting for
# Google digifeeds.
system(qq($kdu_compress -quiet -i '$infile' -o '$outfile' Clevels=$levels Clayers=8 Corder=RLCP Cuse_sop=yes Cuse_eph=yes "Cmodes=RESET|RESTART|CAUSAL|ERTERM|SEGMARK" -no_weights -slope 42988))
# Single quality level with requested PSNR of 32dB. See DEV-10
system(qq($grk_compress -i "$infile" -o "$outfile" -p RLCP -n $levels -SOP -EPH -M 62 -I -q 32))

and die("kdu_compress returned $?");
and die("grk_compress returned $?");

# then set new metadata fields: copy from exiftool field called
# IFD0:whatever to XMP-tiff:whatever, where the fields have the same name
Expand Down
5 changes: 0 additions & 5 deletions docker/base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,4 @@ RUN cpan \
Mozilla::CA

WORKDIR /tmp
RUN wget http://kakadusoftware.com/wp-content/uploads/KDU805_Demo_Apps_for_Linux-x86-64_200602.zip
RUN unzip -j -d kakadu KDU805_Demo_Apps_for_Linux-x86-64_200602.zip
RUN mv /tmp/kakadu/*.so /usr/local/lib
RUN mv /tmp/kakadu/kdu* /usr/local/bin
RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/kakadu.conf
RUN ldconfig
6 changes: 2 additions & 4 deletions etc/config/base_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,8 @@ jhove: /usr/bin/jhove
jhoveconf: /etc/jhove/jhove.conf
xerces: /usr/local/feed/bin/validateCache
xerces_cache: /usr/local/feed/etc/schema.cache
kdu_compress: /usr/local/bin/kdu_compress
kdu_expand: /usr/local/bin/kdu_expand
grk_compress: /usr/bin/grk_compress
grk_decompress: /usr/bin/grk_decompress
epubcheck: /usr/bin/java -jar /usr/bin/epubcheck
mp3val: /usr/bin/mp3val
# FIXME add to image
kdu_munge: TODO ADD TO IMAGE

14 changes: 7 additions & 7 deletions etc/config/premis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ premis_tools:
FEEDD: $FindBin::Script . " " . HTFeed::Version::get_vstring();
# Leave as-is.
EXIFTOOL: perl_mod_version('Image::ExifTool');
# FIXME check these versions
# FIXME get these versions from debian packages
# Set to the version of Xerces you are using (must be 3.0 or later)
XERCES: qq(Xerces-C 3.1)
JHOVE: qq(JHOVE 1.11)
XERCES: qq(Xerces-C 3.2.3)
JHOVE: qq(JHOVE 1.20.0)
# Leave as is.
DIGEST_MD5: perl_mod_version('Digest::MD5');
# GnuPG is not needed unless downloading packages from Google.
GPG: qq(GnuPG 2)
GPG: qq(GnuPG 2.2.27)
ZIP: qq(Zip 3.0)
KDU_COMPRESS: qq(kdu_compress 6.4.0)
EPUBCHECK: qq(epubcheck 4.0.2)
GRK_COMPRESS: qq(grok-j2k 9.7.5)
EPUBCHECK: qq(epubcheck 4.2.4)
MP3VAL: qq(mp3val 0.1.8)
CLAMAV: qq(ClamAV 0.103.5)

Expand Down Expand Up @@ -74,7 +74,7 @@ premis_events:
executor_type: HathiTrust Institution ID
tools:
- FEEDD
- KDU_COMPRESS
- GRK_COMPRESS
- EXIFTOOL

image_header_modification:
Expand Down
5 changes: 0 additions & 5 deletions lib/HTFeed/PackageType.pm
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,6 @@ BEGIN {
# don't migrate any events by default
migrate_events => {},

# use kdu_munge when remediating JPEG2000 images. disabled by default;
# exiftool should be able to handle extended-size JPEG2000 boxes as of
# version 10.03.
use_kdu_munge => 0,

# do not skip any validation by default
skip_validation => [],
skip_validation_note => '',
Expand Down
61 changes: 17 additions & 44 deletions lib/HTFeed/Stage/ImageRemediate.pm
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,7 @@ sub _remediate_jpeg2000 {
$self->set_new_if_undefined( $field, $val );
}

# first copy old values, since kdu_munge will strip the XMP if it is
# present
# first copy old values, since XMP may be stripped/corrupted in some cases
my $exifTool = new Image::ExifTool;
$exifTool->Options('ScanForXMP' => 1);
$exifTool->Options('IgnoreMinorErrors' => 1);
Expand All @@ -557,30 +556,7 @@ sub _remediate_jpeg2000 {
}
}

# Use kdu_munge to strip out the existing XMP. This may help in situations
# where exiftool cannot update existing XMP because it is in a "huge
# JPEG2000 box". Note that as of exiftool 10.03, exiftool should be able to
# handle "extended-size JPEG2000 boxes" so long as they are <4GB.
#
# kdu_munge is a modification to kdu_transcode; see
# http://websites.umich.edu/~roger/kdu_transcode.html
my $kdu_munge = get_config('kdu_munge');
if ( $self->{volume}->get_nspkg()->get('use_kdu_munge')
and defined $kdu_munge
and $kdu_munge )
{
system("$kdu_munge -i '$infile' -o '$outfile' > /dev/null 2>&1")
and $self->set_error(
"OperationFailed",
file => $outfile,
operation => "kdu_munge"
);

$self->update_tags( $exifTool, $outfile );
}
else {
$self->update_tags( $exifTool, $outfile, $infile );
}
$self->update_tags( $exifTool, $outfile, $infile );

}

Expand Down Expand Up @@ -729,37 +705,36 @@ sub expand_lossless_jpeg2000 {
my $jpeg2000_remediated = $file;
my $tiff = $file;
$tiff =~ s/\.jp2$/.tif/;
$jpeg2000_remediated =~ s/\.jp2$/.jp2_remediated/;
$jpeg2000_remediated =~ s/\.jp2$/.remediated.jp2/;

my $kdu_expand = get_config('kdu_expand');
system("$kdu_expand -i '$path/$jpeg2000' -o '$path/$tiff' > /dev/null 2>&1");
my $grk_decompress = get_config('grk_decompress');
system("$grk_decompress -i '$path/$jpeg2000' -o '$path/$tiff' > /dev/null 2>&1");


# try to compress the TIFF -> JPEG2000
get_logger()->trace("Compressing $path/$tiff to $path/$jpeg2000");
my $kdu_compress = get_config('kdu_compress');
my $grk_compress = get_config('grk_compress');

if(not defined $self->{recorded_image_compression}) {
$volume->record_premis_event('image_compression');
$self->{recorded_image_compression} = 1;
}

system(
"$kdu_compress -quiet -i '$path/$tiff' -o '$path/$jpeg2000_remediated' Clevels=5 Clayers=8 Corder=RLCP Cuse_sop=yes Cuse_eph=yes 'Cmodes=RESET|RESTART|CAUSAL|ERTERM|SEGMARK' -no_weights -slope 42988 > /dev/null 2>&1"
)
# Single quality level with requested PSNR of 32dB. See DEV-10
system(qq($grk_compress -i "$path/$tiff" -o "$path/$jpeg2000_remediated" -p RLCP -n 5 -SOP -EPH -M 62 -I -q 32 > /dev/null 2>&1))

and $self->set_error(
"OperationFailed",
operation => "kdu_compress",
operation => "grk_compress",
file => "$path/$tiff",
detail => "kdu_compress returned $?"
detail => "grk_compress returned $?"
);


# copy all headers from the original jpeg2000
# kdu_compress loses info from IFD0 headers, which are sometimes present in JPEG2000 images
# grk_compress loses info from IFD0 headers, which are sometimes present in JPEG2000 images
my $exiftool = new Image::ExifTool;
$exiftool->SetNewValuesFromFile("$path/$jpeg2000");
$exiftool->SetNewValuesFromFile("$path/$jpeg2000",'*:*');
$exiftool->WriteInfo("$path/$jpeg2000_remediated");

rename("$path/$jpeg2000_remediated","$path/$jpeg2000");
Expand Down Expand Up @@ -883,14 +858,14 @@ sub convert_tiff_to_jpeg2000 {

# try to compress the TIFF -> JPEG2000
get_logger()->trace("Compressing $infile to $outfile");
my $kdu_compress = get_config('kdu_compress');
my $grk_compress = get_config('grk_compress');

if(not defined $self->{recorded_image_compression}) {
$volume->record_premis_event('image_compression');
$self->{recorded_image_compression} = 1;
}

# Settings for kdu_compress recommended from Roger Espinosa. "-slope"
# Historical note: the settings formerly passed to kdu_compress were
# recommended by Roger Espinosa. "-slope" is a VBR compression mode; the
# value of 42988 corresponds to the pre-6.4 slope of 51180, the recommended
# setting (as of 5/6/2011) for Google digifeeds. The grk_compress invocation
# in this function does not pass -slope.
Expand Down Expand Up @@ -929,15 +904,13 @@ sub convert_tiff_to_jpeg2000 {
system( "$imagemagick_cmd -compress None $infile -strip $infile.unc.tif > /dev/null 2>&1" )
and $self->set_error("OperationFailed", operation => "imagemagick", file => $infile, detail => "decompress and ICC profile strip failed: returned $?");

system(
"$kdu_compress -quiet -i '$infile.unc.tif' -o '$outfile' Clevels=$levels Clayers=8 Corder=RLCP Cuse_sop=yes Cuse_eph=yes 'Cmodes=RESET|RESTART|CAUSAL|ERTERM|SEGMARK' -no_weights -slope 42988 > /dev/null 2>&1"
)
system(qq($grk_compress -i "$infile.unc.tif" -o "$outfile" -p RLCP -n $levels -SOP -EPH -M 62 -I > /dev/null 2>&1))

and $self->set_error(
"OperationFailed",
operation => "kdu_compress",
operation => "grk_compress",
file => $infile,
detail => "kdu_compress returned $?"
detail => "grk_compress returned $?"
);

# then set new metadata fields - the rest will automatically be
Expand Down
Binary file added t/fixtures/simple/test/lossless_jp2.zip
Binary file not shown.
Binary file added t/fixtures/simple/test/lossless_jp2_with_xmp.zip
Binary file not shown.
Binary file added t/fixtures/simple/test/rgb_tif.zip
Binary file not shown.
56 changes: 56 additions & 0 deletions t/local_ingest.t
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,62 @@ describe "HTFeed::PackageType::Simple" => sub {
ok($testlog->matches(qr(File validation failed.*meta\.yml)s));
}
};

# Examples for the Simple package type's ImageRemediate stage. Each example
# unpacks a fixture volume, runs the stage, and then checks either that the
# volume validates or that the XMP-tiff:Make value is present in the
# resulting JPEG2000.
describe "HTFeed::PackageType::Simple::ImageRemediate" => sub {
    it "compresses tif to a valid jpeg2000" => sub {
        my $vol   = unpacked_volume("rgb_tif");
        my $stage = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $vol);
        $stage->run();

        # The stage should have written a .jp2 for the first page and
        # reported success.
        my $jp2_path = "$tmpdirs->{ingest}/rgb_tif/00000001.jp2";
        ok(-e $jp2_path);
        ok($stage->succeeded());

        # Build the source METS, then validate the whole volume.
        my $mets = HTFeed::PackageType::Simple::SourceMETS->new(volume => $vol);
        $mets->run();

        my $validator = HTFeed::VolumeValidator->new(volume => $vol);
        $validator->run();
        ok($validator->succeeded());
    };

    it "preserves XMP values when compressing tif" => sub {
        my $vol = unpacked_volume("rgb_tif");
        HTFeed::PackageType::Simple::ImageRemediate->new(volume => $vol)->run();

        # Read the metadata back from the compressed image.
        my $exif = Image::ExifTool->new();
        $exif->ExtractInfo("$tmpdirs->{ingest}/rgb_tif/00000001.jp2");
        is($exif->GetValue("XMP-tiff:Make"), "Test scanner make");
    };

    it "recompresses lossless jpeg2000 to a valid jpeg2000" => sub {
        my $vol = unpacked_volume("lossless_jp2");

        my $stage = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $vol);
        $stage->run();

        my $mets = HTFeed::PackageType::Simple::SourceMETS->new(volume => $vol);
        $mets->run();

        my $validator = HTFeed::VolumeValidator->new(volume => $vol);
        $validator->run();
        ok($validator->succeeded());
    };

    it "preserves the XMP when recompressing a lossless JPEG2000" => sub {
        # The fixture's jp2 carries artist & resolution fields in its XMP;
        # those should survive recompression.
        my $vol = unpacked_volume("lossless_jp2_with_xmp");

        my $stage = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $vol);
        $stage->run();

        my $mets = HTFeed::PackageType::Simple::SourceMETS->new(volume => $vol);
        $mets->run();

        my $validator = HTFeed::VolumeValidator->new(volume => $vol);
        $validator->run();
        ok($validator->succeeded());

        # Read the metadata back from the recompressed image.
        my $exif = Image::ExifTool->new();
        $exif->ExtractInfo("$tmpdirs->{ingest}/lossless_jp2_with_xmp/00000001.jp2");
        is($exif->GetValue("XMP-tiff:Make"), "Test scanner make");
    };

};
};

describe "HTFeed::PackageType::Simple::Download" => sub {
Expand Down