From 926d862b9363989d275c7ad8d244215529259a37 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 26 Apr 2024 12:05:36 +0200 Subject: [PATCH 1/2] Move creation of FileCompound into two different constructors. One is for simple file part only, the other is for multi part only. We also introduce a static helper function `openSinglePieceOrSplitFile` to build the FileCompound as before. This helper function is put in FileImpl which can seem a bit odd but will simplify next commits. --- src/file_compound.cpp | 45 +++++++++++++++++++++++++++---------------- src/file_compound.h | 4 ++++ src/fileimpl.cpp | 2 +- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/src/file_compound.cpp b/src/file_compound.cpp index 4653d19a8..763de65f5 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -21,6 +21,7 @@ #include "file_compound.h" #include +#include #include #include #include @@ -40,30 +41,40 @@ void FileCompound::addPart(FilePart* fpart) _fsize += fpart->size(); } +std::shared_ptr FileCompound::openSinglePieceOrSplitZimFile(const std::string& filename) { + try { + return std::make_shared(filename); + } catch (...) { + return std::make_shared(filename, FileCompound::MultiPartToken::Multi); + } +} + FileCompound::FileCompound(const std::string& filename): _filename(filename), _fsize(0) +{ + addPart(new FilePart(filename)); +} + +FileCompound::FileCompound(const std::string& base_filename, MultiPartToken _token): + _filename(base_filename), + _fsize(0) { try { - addPart(new FilePart(filename)); - } catch(...) { - int errnoSave = errno; - _fsize = zsize_t(0); - try { - for (char ch0 = 'a'; ch0 <= 'z'; ++ch0) + for (char ch0 = 'a'; ch0 <= 'z'; ++ch0) + { + const std::string fname0 = base_filename + ch0; + for (char ch1 = 'a'; ch1 <= 'z'; ++ch1) { - const std::string fname0 = filename + ch0; - for (char ch1 = 'a'; ch1 <= 'z'; ++ch1) - { - addPart(new FilePart(fname0 + ch1)); - } + addPart(new FilePart(fname0 + ch1)); } - } catch (...) { } - - if (empty()) - throw std::runtime_error(Formatter() - << "error " << errnoSave << " opening file \"" - << filename << "\""); + } + } catch (std::runtime_error& e) { + // This catch acts as a break for the double loop. + } + if (empty()) { + // We haven't found any part + throw std::runtime_error(Formatter() << "Error opening as a split file: " << base_filename); } } diff --git a/src/file_compound.h b/src/file_compound.h index 05fc13c38..5fe941599 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -25,6 +25,7 @@ #include "zim_types.h" #include "debug.h" #include +#include #include namespace zim { @@ -53,9 +54,12 @@ class FileCompound : private std::map { public: // types typedef const_iterator PartIterator; typedef std::pair PartRange; + enum class MultiPartToken { Multi }; public: // functions + static std::shared_ptr openSinglePieceOrSplitZimFile(const std::string& filename); explicit FileCompound(const std::string& filename); + explicit FileCompound(const std::string& filename, MultiPartToken token); #ifndef _WIN32 explicit FileCompound(int fd); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 8b6e2c242..2341febd0 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -163,7 +163,7 @@ class Grouping // FileImpl // FileImpl::FileImpl(const std::string& fname) - : FileImpl(std::make_shared(fname)) + : FileImpl(FileCompound::openSinglePieceOrSplitZimFile(fname)) {} #ifndef _WIN32 From 48e64c7b961e6f1faf6dfdf1a88ed553b55819c5 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 26 Apr 2024 17:28:22 +0200 Subject: [PATCH 2/2] Make openSinglePieceOrSplitZimFile better handling single/multi part opening. Now, it tries to open as single piece only if not ending by `.zimaa`. If it cannot find the file, or if ending with `.zimaa`, it tries multi zim part. In any case, FileImpl constructor check for size and throw a ZimFileFormatError is file's size doesn't correspond to what is the header. --- src/file_compound.cpp | 16 ++++++++++++---- src/file_compound.h | 2 +- src/fileimpl.cpp | 13 +++++++++---- test/archive.cpp | 27 ++++++++++++++++++++++++++- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/file_compound.cpp b/src/file_compound.cpp index 763de65f5..b97841434 100644 --- a/src/file_compound.cpp +++ b/src/file_compound.cpp @@ -41,12 +41,20 @@ void FileCompound::addPart(FilePart* fpart) _fsize += fpart->size(); } -std::shared_ptr FileCompound::openSinglePieceOrSplitZimFile(const std::string& filename) { +std::shared_ptr FileCompound::openSinglePieceOrSplitZimFile(std::string filename) { + std::shared_ptr fileCompound; + if (filename.size() > 6 && filename.substr(filename.size()-6) == ".zimaa") { + filename.resize(filename.size()-2); + } else { try { - return std::make_shared(filename); - } catch (...) { - return std::make_shared(filename, FileCompound::MultiPartToken::Multi); + fileCompound = std::make_shared(filename); + } catch(...) { } } + + if ( !fileCompound ) { + fileCompound = std::make_shared(filename, FileCompound::MultiPartToken::Multi); + } + return fileCompound; } FileCompound::FileCompound(const std::string& filename): diff --git a/src/file_compound.h b/src/file_compound.h index 5fe941599..a49de8869 100644 --- a/src/file_compound.h +++ b/src/file_compound.h @@ -57,7 +57,7 @@ class FileCompound : private std::map { enum class MultiPartToken { Multi }; public: // functions - static std::shared_ptr openSinglePieceOrSplitZimFile(const std::string& filename); + static std::shared_ptr openSinglePieceOrSplitZimFile(std::string filename); explicit FileCompound(const std::string& filename); explicit FileCompound(const std::string& filename, MultiPartToken token); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 2341febd0..91132f899 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -207,6 +207,15 @@ class Grouping throw ZimFileFormatError("error reading zim-file header."); } + // This can happen for several reasons: + // - Zim file is corrupted (corrupted header) + // - Zim file is too small (ongoing download, truncated file...) + // - Zim file is embedded at beginning of another file (and we try to open the file as a zim file) + // If open through a FdInput, size should be set in FdInput. + if (header.hasChecksum() && (header.getChecksumPos() + 16) != size_type(zimReader->size())) { + throw ZimFileFormatError("Zim file(s) is of bad size or corrupted."); + } + auto pathPtrReader = sectionSubReader(*zimReader, "Dirent pointer table", offset_t(header.getPathPtrPos()), @@ -297,10 +306,6 @@ class Grouping throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); } } - - if (header.hasChecksum() && header.getChecksumPos() != (getFilesize().v-16) ) { - throw ZimFileFormatError("Checksum position is not valid"); - } } offset_type FileImpl::getMimeListEndUpperLimit() const diff --git a/test/archive.cpp b/test/archive.cpp index c7143fad1..c0241a2fa 100644 --- a/test/archive.cpp +++ b/test/archive.cpp @@ -264,6 +264,31 @@ TEST(ZimArchive, openRealZimArchive) } } +TEST(ZimArchive, openSplitZimArchive) +{ + const char* fname = "wikibooks_be_all_nopic_2017-02_splitted.zim"; + + for (auto& testfile: getDataFilePath(fname)) { + const TestContext ctx{ {"path", testfile.path+"aa" } }; + std::unique_ptr archive; + EXPECT_NO_THROW( archive.reset(new zim::Archive(testfile.path+"aa")) ) << ctx; + if ( archive ) { + EXPECT_TRUE( archive->check() ) << ctx; + } + } +} + +TEST(ZimArchive, openDontFallbackOnNonSplitZimArchive) +{ + const char* fname = "wikibooks_be_all_nopic_2017-02.zim"; + + for (auto& testfile: getDataFilePath(fname)) { + const TestContext ctx{ {"path", testfile.path+"aa" } }; + std::unique_ptr archive; + EXPECT_THROW( archive.reset(new zim::Archive(testfile.path+"aa")), std::runtime_error) << ctx; + } +} + TEST(ZimArchive, randomEntry) { const char* const zimfiles[] = { @@ -434,7 +459,7 @@ TEST(ZimArchive, validate) TEST_BROKEN_ZIM_NAME( "invalid.invalid_checksumpos.zim", - "Checksum position is not valid\n" + "Zim file(s) is of bad size or corrupted.\n" ); TEST_BROKEN_ZIM_NAME(