pdf-parser.py

#!/usr/bin/env python

__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
__version__ = '0.7.10'
__date__ = '2024/10/26'
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 11, 1)

"""
Source code put in public domain by Didier Stevens, no Copyright
https://DidierStevens.com
Use at your own risk

History:
  2008/05/02: continue
  2008/05/03: continue
  2008/06/02: streams
  2008/10/19: refactor, grep & extract functionality
  2008/10/20: reference
  2008/10/21: cleanup
  2008/11/12: V0.3 dictionary parser
  2008/11/13: option elements
  2008/11/14: continue
  2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco)
  2009/05/11: V0.3.1 updated usage, added --verbose and --extract
  2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco)
  2009/07/18: bugfix EqualCanonical
  2009/07/24: V0.3.3 Added --hash option
  2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput
  2009/07/28: V0.3.4 Added ASCII85Decode support
  2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation
  2009/08/30: V0.3.6 TestPythonVersion
  2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option
  2010/01/09: Fixed parsing of incomplete startxref
  2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option
  2011/12/17: fixed bugs empty objects
  2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt)
  2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option
  2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range
  2013/02/16: V0.4.0 added http/https support; added error handling for missing file or URL; ; added support for ZIP file with password 'infected'
  2013/03/13: V0.4.1 fixes for Python 3
  2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
  2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
  2013/09/18: V0.4.3 fixed regression bug -w option
  2014/09/25: V0.5.0 added option -g
  2014/09/29: Added PrintGenerateObject and PrintOutputObject
  2014/12/05: V0.6.0 Added YARA support
  2014/12/09: cleanup, refactoring
  2014/12/13: Python 3 fixes
  2015/01/11: Added support for multiple YARA rule files; added request to search in trailer
  2015/01/31: V0.6.1 Added optionyarastrings
  2015/02/09: Added decoders
  2015/04/05: V0.6.2 Added generateembedded
  2015/04/06: fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace; fixed prettyprint bug
  2015/04/24: V0.6.3 when option dump's filename is -, content is dumped to stdout
  2015/08/12: V0.6.4 option hash now also calculates hashes of streams when selecting or searching objects; and displays hexasciidump first line
  2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me
  2016/11/20: V0.6.6 added workaround zlib errors FlateDecode
  2016/12/17: V0.6.7 added option -k
  2017/01/07: V0.6.8 changed cPDFParseDictionary to handle strings () with % character
  2017/10/28: fixed bug
  2017/10/29: added # support for option -y
  2018/06/29: V0.6.9 added option --overridingfilters
  2018/10/20: added keywords to statistics
  2019/02/22: V0.7.0 added option -O --objstm to parse the stream of /ObjStm objects, inspired by a contributor wishing anonymity
  2019/03/01: V0.7.1 added ContainsName for correct keyword statistics (-a)
  2019/04/12: V0.7.2 Python 2.6.6 compatibility fix
  2019/07/30: bug fixes (including fixes Josef Hinteregger)
  2019/09/26: V0.7.3 added multiple id selection to option -o; added man page (-m); added environment variable PDFPARSER_OPTIONS; bug fixes
  2019/11/05: V0.7.4 fixed plugin path when compiled with pyinstaller, replaced eval with int
  2021/07/03: V0.7.5 bug fixes; fixed ASCII85Decode Python 3 bug thanks to R Primus
  2021/11/23: V0.7.6 Python 3 bug fixes
  2022/05/24: bug fixes
  2022/11/09: V0.7.7 added support for environment variable DSS_DEFAULT_HASH_ALGORITHMS
  2023/01/03: V0.7.8 added unreferenced objects to statistics
  2024/03/21: V0.7.9 added option jsonoutput; added verbose YARA rules
  2024/10/25: V0.7.10 /ObjStm fix (x9090 PR)
  2024/10/26: added pyzipper support

Todo:
  - handle printf todo
  - support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir

"""

import re
import optparse
import zlib
import binascii
import hashlib
import sys
import time
import os
import textwrap
import json
if sys.version_info[0] >= 3:
    from io import StringIO
    import urllib.request
    urllib23 = urllib.request
    import configparser as ConfigParser
else:
    from cStringIO import StringIO
    import urllib2
    urllib23 = urllib2
    import ConfigParser
try:
    import yara
except:
    pass
try:
    import pyzipper as zipfile
except ImportError:
    import zipfile

CHAR_WHITESPACE = 1
CHAR_DELIMITER = 2
CHAR_REGULAR = 3

CONTEXT_NONE = 1
CONTEXT_OBJ = 2
CONTEXT_XREF = 3
CONTEXT_TRAILER = 4

PDF_ELEMENT_COMMENT = 1
PDF_ELEMENT_INDIRECT_OBJECT = 2
PDF_ELEMENT_XREF = 3
PDF_ELEMENT_TRAILER = 4
PDF_ELEMENT_STARTXREF = 5
PDF_ELEMENT_MALFORMED = 6

dumplinelength = 16

def PrintManual():
    manual = '''
Manual:

This manual is a work in progress.

There is a free PDF analysis book:
https://blog.didierstevens.com/2010/09/26/free-malicious-pdf-analysis-e-book/

Option -o is used to select objects by id. Provide a single id or multiple ids separated by a comma (,).

When environment variable PDFPARSER_OPTIONS is defined, the options it defines are added implicitely to the command line arguments.
Use this to define options you want included with each use of pdf-parser.py.
Like option -O, to parse stream objects (/ObjStm).
By defining PDFPARSER_OPTIONS=-O, pdf-parser will always parse stream objects (when found).
PS: this feature is experimental.

Option -H calculates the MD5 hash by default.
This can be changed by setting environment variable DSS_DEFAULT_HASH_ALGORITHMS.
Like this: set DSS_DEFAULT_HASH_ALGORITHMS=sha256

Option --jsonoutput produces JSON output with the stream content of all objects with streams. Options -f and --overridingfilters apply.
For example, if option -f is used, the JSON output contains the filtered streams, otherwise the JSON output contains the unfiltered streams.

'''
    for line in manual.split('\n'):
        print(textwrap.fill(line))

#Convert 2 Bytes If Python 3
def C2BIP3(string):
    if sys.version_info[0] > 2:
        if type(string) == bytes:
            return string
        else:
            return bytes([ord(x) for x in string])
    else:
        return string

#Convert 2 String If Python 3
def C2SIP3(bytes):
    if sys.version_info[0] > 2:
        return ''.join([chr(byte) for byte in bytes])
    else:
        return bytes

# CIC: Call If Callable
def CIC(expression):
    if callable(expression):
        return expression()
    else:
        return expression

# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
    if expression:
        return CIC(valueTrue)
    else:
        return CIC(valueFalse)

def Timestamp(epoch=None):
    if epoch == None:
        localTime = time.localtime()
    else:
        localTime = time.localtime(epoch)
    return '%04d%02d%02d-%02d%02d%02d' % localTime[0:6]

def CopyWithoutWhiteSpace(content):
    result = []
    for token in content:
        if token[0] != CHAR_WHITESPACE:
            result.append(token)
    return result

def Obj2Str(content):
    return ''.join(map(lambda x: repr(x[1])[1:-1], CopyWithoutWhiteSpace(content)))

def CreateZipFileObject(arg1, arg2):
    if 'AESZipFile' in dir(zipfile):
        return zipfile.AESZipFile(arg1, arg2)
    else:
        return zipfile.ZipFile(arg1, arg2)

class cPDFDocument:
    def __init__(self, file):
        self.file = file
        if type(file) != str:
            self.infile = file
        elif file.lower().startswith('http://') or file.lower().startswith('https://'):
            try:
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except urllib23.HTTPError:
                print('Error accessing URL %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        elif file.lower().endswith('.zip'):
            try:
                self.zipfile = CreateZipFileObject(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except:
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        else:
            try:
                self.infile = open(file, 'rb')
            except:
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        self.ungetted = []
        self.position = -1

    def byte(self):
        if len(self.ungetted) != 0:
            self.position += 1
            return self.ungetted.pop()
        inbyte = self.infile.read(1)
        if not inbyte or inbyte == '':
            self.infile.close()
            return None
        self.position += 1
        return ord(inbyte)

    def unget(self, byte):
        self.position -= 1
        self.ungetted.append(byte)

def CharacterClass(byte):
    if byte == 0 or byte == 9 or byte == 10 or byte == 12 or byte == 13 or byte == 32:
        return CHAR_WHITESPACE
    if byte == 0x28 or byte == 0x29 or byte == 0x3C or byte == 0x3E or byte == 0x5B or byte == 0x5D or byte == 0x7B or byte == 0x7D or byte == 0x2F or byte == 0x25:
        return CHAR_DELIMITER
    return CHAR_REGULAR

def IsNumeric(str):
    return re.match('^[0-9]+', str)

class cPDFTokenizer:
    def __init__(self, file):
        self.oPDF = cPDFDocument(file)
        self.ungetted = []

    def Token(self):
        if len(self.ungetted) != 0:
            return self.ungetted.pop()
        if self.oPDF == None:
            return None
        self.byte = self.oPDF.byte()
        if self.byte == None:
            self.oPDF = None
            return None
        elif CharacterClass(self.byte) == CHAR_WHITESPACE:
            file_str = StringIO()
            while self.byte != None and CharacterClass(self.byte) == CHAR_WHITESPACE:
                file_str.write(chr(self.byte))
                self.byte = self.oPDF.byte()
            if self.byte != None:
                self.oPDF.unget(self.byte)
            else:
                self.oPDF = None
            self.token = file_str.getvalue()
            return (CHAR_WHITESPACE, self.token)
        elif CharacterClass(self.byte) == CHAR_REGULAR:
            file_str = StringIO()
            while self.byte != None and CharacterClass(self.byte) == CHAR_REGULAR:
                file_str.write(chr(self.byte))
                self.byte = self.oPDF.byte()
            if self.byte != None:
                self.oPDF.unget(self.byte)
            else:
                self.oPDF = None
            self.token = file_str.getvalue()
            return (CHAR_REGULAR, self.token)
        else:
            if self.byte == 0x3C:
                self.byte = self.oPDF.byte()
                if self.byte == 0x3C:
                    return (CHAR_DELIMITER, '<<')
                else:
                    self.oPDF.unget(self.byte)
                    return (CHAR_DELIMITER, '<')
            elif self.byte == 0x3E:
                self.byte = self.oPDF.byte()
                if self.byte == 0x3E:
                    return (CHAR_DELIMITER, '>>')
                else:
                    self.oPDF.unget(self.byte)
                    return (CHAR_DELIMITER, '>')
            elif self.byte == 0x25:
                file_str = StringIO()
                while self.byte != None:
                    file_str.write(chr(self.byte))
                    if self.byte == 10 or self.byte == 13:
                        self.byte = self.oPDF.byte()
                        break
                    self.byte = self.oPDF.byte()
                if self.byte != None:
                    if self.byte == 10:
                        file_str.write(chr(self.byte))
                    else:
                        self.oPDF.unget(self.byte)
                else:
                    self.oPDF = None
                self.token = file_str.getvalue()
                return (CHAR_DELIMITER, self.token)
            return (CHAR_DELIMITER, chr(self.byte))

    def TokenIgnoreWhiteSpace(self):
        token = self.Token()
        while token != None and token[0] == CHAR_WHITESPACE:
            token = self.Token()
        return token

    def Tokens(self):
        tokens = []
        token = self.Token()
        while token != None:
            tokens.append(token)
            token = self.Token()
        return tokens

    def unget(self, byte):
        self.ungetted.append(byte)

class cPDFParser:
    def __init__(self, file, verbose=False, extract=None, objstm=None):
        self.context = CONTEXT_NONE
        self.content = []
        self.oPDFTokenizer = cPDFTokenizer(file)
        self.verbose = verbose
        self.extract = extract
        self.objstm = objstm

    def GetObject(self):
        while True:
            if self.context == CONTEXT_OBJ:
                self.token = self.oPDFTokenizer.Token()
            else:
                self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
            if self.token:
                if self.token[0] == CHAR_DELIMITER:
                    if self.token[1][0] == '%':
                        if self.context == CONTEXT_OBJ:
                            self.content.append(self.token)
                        else:
                            return cPDFElementComment(self.token[1])
                    elif self.token[1] == '/':
                        self.token2 = self.oPDFTokenizer.Token()
                        if self.token2[0] == CHAR_REGULAR:
                            if self.context != CONTEXT_NONE:
                                self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
                            elif self.verbose:
                                print('todo 1: %s' % (self.token[1] + self.token2[1]))
                        else:
                            self.oPDFTokenizer.unget(self.token2)
                            if self.context != CONTEXT_NONE:
                                self.content.append(self.token)
                            elif self.verbose:
                                print('todo 2: %d %s' % (self.token[0], repr(self.token[1])))
                    elif self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 3: %d %s' % (self.token[0], repr(self.token[1])))
                elif self.token[0] == CHAR_WHITESPACE:
                    if self.context != CONTEXT_NONE:
                        self.content.append(self.token)
                    elif self.verbose:
                        print('todo 4: %d %s' % (self.token[0], repr(self.token[1])))
                else:
                    if self.context == CONTEXT_OBJ:
                        if self.token[1] == 'endobj':
                            self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content, self.objstm)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementIndirectObject
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_TRAILER:
                        if self.token[1] == 'startxref' or self.token[1] == 'xref':
                            self.oPDFElementTrailer = cPDFElementTrailer(self.content)
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementTrailer
                        else:
                            self.content.append(self.token)
                    elif self.context == CONTEXT_XREF:
                        if self.token[1] == 'trailer' or self.token[1] == 'xref':
                            self.oPDFElementXref = cPDFElementXref(self.content)
                            self.oPDFTokenizer.unget(self.token)
                            self.context = CONTEXT_NONE
                            self.content = []
                            return self.oPDFElementXref
                        else:
                            self.content.append(self.token)
                    else:
                        if IsNumeric(self.token[1]):
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if IsNumeric(self.token2[1]):
                                self.token3 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                                if self.token3[1] == 'obj':
                                    self.objectId = int(self.token[1], 10)
                                    self.objectVersion = int(self.token2[1], 10)
                                    self.context = CONTEXT_OBJ
                                else:
                                    self.oPDFTokenizer.unget(self.token3)
                                    self.oPDFTokenizer.unget(self.token2)
                                    if self.verbose:
                                        print('todo 6: %d %s' % (self.token[0], repr(self.token[1])))
                            else:
                                self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print('todo 7: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.token[1] == 'trailer':
                            self.context = CONTEXT_TRAILER
                            self.content = [self.token]
                        elif self.token[1] == 'xref':
                            self.context = CONTEXT_XREF
                            self.content = [self.token]
                        elif self.token[1] == 'startxref':
                            self.token2 = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
                            if self.token2 and IsNumeric(self.token2[1]):
                                return cPDFElementStartxref(int(self.token2[1], 10))
                            else:
                                self.oPDFTokenizer.unget(self.token2)
                                if self.verbose:
                                    print('todo 9: %d %s' % (self.token[0], repr(self.token[1])))
                        elif self.extract:
                            self.bytes = ''
                            while self.token:
                                self.bytes += self.token[1]
                                self.token = self.oPDFTokenizer.Token()
                            return cPDFElementMalformed(self.bytes)
                        elif self.verbose:
                            print('todo 10: %d %s' % (self.token[0], repr(self.token[1])))
            else:
                break

class cPDFElementComment:
    def __init__(self, comment):
        self.type = PDF_ELEMENT_COMMENT
        self.comment = comment
#                        if re.match('^%PDF-[0-9]\.[0-9]', self.token[1]):
#                            print(repr(self.token[1]))
#                        elif re.match('^%%EOF', self.token[1]):
#                            print(repr(self.token[1]))

class cPDFElementXref:
    def __init__(self, content):
        self.type = PDF_ELEMENT_XREF
        self.content = content

class cPDFElementTrailer:
    def __init__(self, content):
        self.type = PDF_ELEMENT_TRAILER
        self.content = content

    def Contains(self, keyword):
        data = ''
        for i in range(0, len(self.content)):
            if self.content[i][1] == 'stream':
                break
            else:
                data += Canonicalize(self.content[i][1])
        return data.upper().find(keyword.upper()) != -1

def IIf(expr, truepart, falsepart):
    if expr:
        return truepart
    else:
        return falsepart

class cPDFElementIndirectObject:
    def __init__(self, id, version, content, objstm=None):
        self.type = PDF_ELEMENT_INDIRECT_OBJECT
        self.id = id
        self.version = version
        self.content = content
        self.objstm = objstm
        #fix stream for Ghostscript bug reported by Kurt
        if self.ContainsStream():
            position = len(self.content) - 1
            if position < 0:
                return
            while self.content[position][0] == CHAR_WHITESPACE and position >= 0:
                position -= 1
            if position < 0:
                return
            if self.content[position][1].endswith('endstream\n'):
                self.content = self.content[0:position] + [(self.content[position][0], self.content[position][1][:-len('endstream\n')])] + [(CHAR_REGULAR, 'endstream')] + self.content[position+1:]
                return
            if self.content[position][0] != CHAR_REGULAR:
                return
            if self.content[position][1] == 'endstream':
                return
            if not self.content[position][1].endswith('endstream'):
                return
            self.content = self.content[0:position] + [(self.content[position][0], self.content[position][1][:-len('endstream')])] + [(self.content[position][0], 'endstream')] + self.content[position+1:]

    def GetType(self):
        content = CopyWithoutWhiteSpace(self.content)
        dictionary = 0
        for i in range(0, len(content)):
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '<<':
                dictionary += 1
            if content[i][0] == CHAR_DELIMITER and content[i][1] == '>>':
                dictionary -= 1
            if dictionary == 1 and content[i][0] == CHAR_DELIMITER and EqualCanonical(content[i][1], '/Type') and i < len(content) - 1:
                return content[i+1][1]
        return ''

    def GetReferences(self):
        content = CopyWithoutWhiteSpace(self.content)
        references = []
        for i in range(0, len(content)):
            if i > 1 and content[i][0] == CHAR_REGULAR and content[i][1] == 'R' and content[i-2][0] == CHAR_REGULAR and IsNumeric(content[i-2][1]) and content[i-1][0] == CHAR_REGULAR and IsNumeric(content[i-1][1]):
                references.append((content[i-2][1], content[i-1][1], content[i][1]))
        return references

    def References(self, index):
        for ref in self.GetReferences():
            if ref[0] == index:
                return True
        return False

    def ContainsStream(self):
        for i in range(0, len(self.content)):
            if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
                return self.content[0:i]
        return False

    def Contains(self, keyword):
        data = ''
        for i in range(0, len(self.content)):
            if self.content[i][1] == 'stream':
                break
            else:
                data += Canonicalize(self.content[i][1])
        return data.upper().find(keyword.upper()) != -1

    def ContainsName(self, keyword):
        for token in self.content:
            if token[1] == 'stream':
                return False
            if token[0] == CHAR_DELIMITER and EqualCanonical(token[1], keyword):
                return True
        return False

    def StreamContains(self, keyword, filter, casesensitive, regex, overridingfilters):
        if not self.ContainsStream():
            return False
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)
        if isinstance(streamData, bytes):
            keyword = keyword.encode()
        if regex:
            return re.search(keyword, streamData, IIf(casesensitive, 0, re.I))
        elif casesensitive:
            return keyword in streamData
        else:
            return keyword.lower() in streamData.lower()

    def Stream(self, filter=True, overridingfilters=''):
        state = 'start'
        countDirectories = 0
        data = ''
        filters = []
        for i in range(0, len(self.content)):
            if state == 'start':
                if self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '<<':
                    countDirectories += 1
                if self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '>>':
                    countDirectories -= 1
                if countDirectories == 1 and self.content[i][0] == CHAR_DELIMITER and EqualCanonical(self.content[i][1], '/Filter'):
                    state = 'filter'
                elif countDirectories == 0 and self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
                    state = 'stream-whitespace'
            elif state == 'filter':
                if self.content[i][0] == CHAR_DELIMITER and self.content[i][1][0] == '/':
                    filters = [self.content[i][1]]
                    state = 'search-stream'
                elif self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == '[':
                    state = 'filter-list'
            elif state == 'filter-list':
                if self.content[i][0] == CHAR_DELIMITER and self.content[i][1][0] == '/':
                    filters.append(self.content[i][1])
                elif self.content[i][0] == CHAR_DELIMITER and self.content[i][1] == ']':
                    state = 'search-stream'
            elif state == 'search-stream':
                if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'stream':
                    state = 'stream-whitespace'
            elif state == 'stream-whitespace':
                if self.content[i][0] == CHAR_WHITESPACE:
                    whitespace = self.content[i][1]
                    if whitespace.startswith('\x0D\x0A') and len(whitespace) > 2:
                        data += whitespace[2:]
                    elif whitespace.startswith('\x0A') and len(whitespace) > 1:
                        data += whitespace[1:]
                else:
                    data += self.content[i][1]
                state = 'stream-concat'
            elif state == 'stream-concat':
                if self.content[i][0] == CHAR_REGULAR and self.content[i][1] == 'endstream':
                    if filter:
                        if overridingfilters == '':
                            return self.Decompress(data, filters)
                        elif overridingfilters == 'raw':
                            return data
                        else:
                            return self.Decompress(data, overridingfilters.split(' '))
                    else:
                        return data
                else:
                    data += self.content[i][1]
            else:
                return 'Unexpected filter state'
        return filters

    def Decompress(self, data, filters):
        for filter in filters:
            if EqualCanonical(filter, '/FlateDecode') or EqualCanonical(filter, '/Fl'):
                try:
                    data = FlateDecode(data)
                except zlib.error as e:
                    message = 'FlateDecode decompress failed'
                    if len(data) > 0 and ord(data[0]) & 0x0F != 8:
                        message += ', unexpected compression method: %02x' % ord(data[0])
                    return message + '. zlib.error %s' % e.message
            elif EqualCanonical(filter, '/ASCIIHexDecode') or EqualCanonical(filter, '/AHx'):
                try:
                    data = ASCIIHexDecode(data)
                except:
                    return 'ASCIIHexDecode decompress failed'
            elif EqualCanonical(filter, '/ASCII85Decode') or EqualCanonical(filter, '/A85'):
                try:
                    data = ASCII85Decode(data.rstrip('>'))
                except:
                    return 'ASCII85Decode decompress failed'
            elif EqualCanonical(filter, '/LZWDecode') or EqualCanonical(filter, '/LZW'):
                try:
                    data = LZWDecode(data)
                except:
                    return 'LZWDecode decompress failed'
            elif EqualCanonical(filter, '/RunLengthDecode') or EqualCanonical(filter, '/R'):
                try:
                    data = RunLengthDecode(data)
                except:
                    return 'RunLengthDecode decompress failed'
#            elif i.startswith('/CC')                        # CCITTFaxDecode
#            elif i.startswith('/DCT')                       # DCTDecode
            else:
                return 'Unsupported filter: %s' % repr(filters)
        if len(filters) == 0:
            return 'No filters'
        else:
            return data

    def StreamYARAMatch(self, rules, decoders, decoderoptions, filter, overridingfilters):
        if not self.ContainsStream():
            return None
        streamData = self.Stream(filter, overridingfilters)
        if filter and streamData == 'No filters':
            streamData = self.Stream(False, overridingfilters)

        oDecoders = [cIdentity(streamData, None)]
        for cDecoder in decoders:
            try:
                oDecoder = cDecoder(streamData, decoderoptions)
                oDecoders.append(oDecoder)
            except Exception as e:
                print('Error instantiating decoder: %s' % cDecoder.name)
                raise e
        results = []
        for oDecoder in oDecoders:
            while oDecoder.Available():
                yaraResults = rules.match(data=oDecoder.Decode())
                if yaraResults != []:
                    results.append([oDecoder.Name(), yaraResults])

        return results

class cPDFElementStartxref:
    def __init__(self, index):
        self.type = PDF_ELEMENT_STARTXREF
        self.index = index

class cPDFElementMalformed:
    def __init__(self, content):
        self.type = PDF_ELEMENT_MALFORMED
        self.content = content

def TrimLWhiteSpace(data):
    while data != [] and data[0][0] == CHAR_WHITESPACE:
        data = data[1:]
    return data

def TrimRWhiteSpace(data):
    while data != [] and data[-1][0] == CHAR_WHITESPACE:
        data = data[:-1]
    return data

class cPDFParseDictionary:
    def __init__(self, content, nocanonicalizedoutput):
        self.content = content
        self.nocanonicalizedoutput = nocanonicalizedoutput
        dataTrimmed = TrimLWhiteSpace(TrimRWhiteSpace(self.content))
        if dataTrimmed == []:
            self.parsed = None
        elif self.isOpenDictionary(dataTrimmed[0]) and (self.isCloseDictionary(dataTrimmed[-1]) or self.couldBeCloseDictionary(dataTrimmed[-1])):
            self.parsed = self.ParseDictionary(dataTrimmed)[0]
        else:
            self.parsed = None

    def isOpenDictionary(self, token):
        return token[0] == CHAR_DELIMITER and token[1] == '<<'

    def isCloseDictionary(self, token):
        return token[0] == CHAR_DELIMITER and token[1] == '>>'

    def couldBeCloseDictionary(self, token):
        return token[0] == CHAR_DELIMITER and token[1].rstrip().endswith('>>')

    def ParseDictionary(self, tokens):
        state = 0 # start
        dictionary = []
        while tokens != []:
            if state == 0:
                if self.isOpenDictionary(tokens[0]):
                    state = 1
                else:
                    return None, tokens
            elif state == 1:
                if self.isOpenDictionary(tokens[0]):
                    pass
                elif self.isCloseDictionary(tokens[0]):
                    return dictionary, tokens
                elif tokens[0][0] != CHAR_WHITESPACE:
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
            elif state == 2:
                if self.isOpenDictionary(tokens[0]):
                    value, tokens = self.ParseDictionary(tokens)
                    dictionary.append((key, value))
                    state = 1
                elif self.isCloseDictionary(tokens[0]):
                    dictionary.append((key, value))
                    return dictionary, tokens
                elif value == [] and tokens[0][0] == CHAR_WHITESPACE:
                    pass
                elif value == [] and tokens[0][1] == '[':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] != ']':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '[' and tokens[0][1] == ']':
                    value.append(tokens[0][1])
                    dictionary.append((key, value))
                    value = []
                    state = 1
                elif value == [] and tokens[0][1] == '(':
                    value.append(tokens[0][1])
                elif value != [] and value[0] == '(' and tokens[0][1] != ')':
                    if tokens[0][1][0] == '%':
                        tokens = [tokens[0]] + cPDFTokenizer(StringIO(tokens[0][1][1:])).Tokens() + tokens[1:]
                        value.append('%')
                    else:
                        value.append(tokens[0][1])
                elif value != [] and value[0] == '(' and tokens[0][1] == ')':
                    value.append(tokens[0][1])
                    balanced = 0
                    for item in value:
                        if item == '(':
                            balanced += 1
                        elif item == ')':
                            balanced -= 1
                    if balanced < 0 and self.verbose:
                        print('todo 11: ' + repr(value))
                    if balanced < 1:
                        dictionary.append((key, value))
                        value = []
                        state = 1
                elif value != [] and tokens[0][1][0] == '/':
                    dictionary.append((key, value))
                    key = ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput)
                    value = []
                    state = 2
                else:
                    value.append(ConditionalCanonicalize(tokens[0][1], self.nocanonicalizedoutput))
            tokens = tokens[1:]
        return None, tokens

    def Retrieve(self):
        return self.parsed

    def PrettyPrintSubElement(self, prefix, e):
        if e[1] == []:
            print('%s  %s' % (prefix, e[0]))
        elif type(e[1][0]) == type(''):
            if len(e[1]) == 3 and IsNumeric(e[1][0]) and e[1][1] == '0' and e[1][2] == 'R':
                joiner = ' '
            else:
                joiner = ''
            value = joiner.join(e[1]).strip()
            reprValue = repr(value)
            if "'" + value + "'" != reprValue:
                value = reprValue
            print('%s  %s %s' % (prefix, e[0], value))
        else:
            print('%s  %s' % (prefix, e[0]))
            self.PrettyPrintSub(prefix + '    ', e[1])

    def PrettyPrintSub(self, prefix, dictionary):
        if dictionary != None:
            print('%s<<' % prefix)
            for e in dictionary:
                self.PrettyPrintSubElement(prefix, e)
            print('%s>>' % prefix)

    def PrettyPrint(self, prefix):
        self.PrettyPrintSub(prefix, self.parsed)

    def Get(self, select):
        for key, value in self.parsed:
            if key == select:
                return value
        return None

    def GetNestedSub(self, dictionary, select):
        for key, value in dictionary:
            if key == select:
                return self.PrettyPrintSubElement('', [select, value])
            if type(value) == type([]) and len(value) > 0 and type(value[0]) == type((None,)):
                result = self.GetNestedSub(value, select)
                if result !=None:
                    return self.PrettyPrintSubElement('', [select, result])
        return None

    def GetNested(self, select):
        return self.GetNestedSub(self.parsed, select)

def FormatOutput(data, raw):
    if raw:
        if type(data) == type([]):
            return ''.join(map(lambda x: x[1], data))
        else:
            return data
    elif sys.version_info[0] > 2:
        return ascii(data)
    else:
        return repr(data)

#Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
    if sys.version_info[0] > 2:
        sys.stdout.buffer.write(data)
    else:
        while data != '':
            sys.stdout.write(data[0:10000])
            try:
                sys.stdout.flush()
            except IOError:
                return
            data = data[10000:]

def IfWIN32SetBinary(io):
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(io.fileno(), os.O_BINARY)

def PrintOutputObject(object, options):
    if options.dump == '-':
        filtered = object.Stream(options.filter == True, options.overridingfilters)
        if filtered == []:
            filtered = ''
        IfWIN32SetBinary(sys.stdout)
        StdoutWriteChunked(filtered)
        return

    print('obj %d %d' % (object.id, object.version))
    if object.objstm != None:
        print(' Containing /ObjStm: %d %d' % object.objstm)
    print(' Type: %s' % ConditionalCanonicalize(object.GetType(), options.nocanonicalizedoutput))
    print(' Referencing: %s' % ', '.join(map(lambda x: '%s %s %s' % x, object.GetReferences())))
    dataPrecedingStream = object.ContainsStream()
    oPDFParseDictionary = None
    if dataPrecedingStream:
        print(' Contains stream')
        if options.debug:
            print(' %s' % FormatOutput(dataPrecedingStream, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(dataPrecedingStream, options.nocanonicalizedoutput)
        if options.hash:
            streamContent = object.Stream(False, options.overridingfilters)
            print('  unfiltered')
            print('   len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
            print('   %s' % HexAsciiDumpLine(streamContent))
            streamContent = object.Stream(True, options.overridingfilters)
            print('  filtered')
            print('   len: %6d md5: %s' % (len(streamContent), hashlib.md5(streamContent).hexdigest()))
            print('   %s' % HexAsciiDumpLine(streamContent))
            streamContent = None
    else:
        if options.debug or options.raw:
            print(' %s' % FormatOutput(object.content, options.raw))
        oPDFParseDictionary = cPDFParseDictionary(object.content, options.nocanonicalizedoutput)
    print('')
    oPDFParseDictionary.PrettyPrint('  ')
    print('')
    if options.filter and not options.dump:
        filtered = object.Stream(overridingfilters=options.overridingfilters)
        if filtered == []:
            print(' %s' % FormatOutput(object.content, options.raw))
        else:
            print(' %s' % FormatOutput(filtered, options.raw))
    if options.content:
        if object.ContainsStream():
            stream = object.Stream(False, options.overridingfilters)
            if stream != []:
                print(' %s' % FormatOutput(stream, options.raw))
        else:
            print(''.join([token[1] for token in object.content]))


    if options.dump:
        filtered = object.Stream(options.filter == True, options.overridingfilters)
        if filtered == []:
            filtered = ''
        try:
            fDump = open(options.dump, 'wb')
            try:
                fDump.write(C2BIP3(filtered))
            except:
                print('Error writing file %s' % options.dump)
            fDump.close()
        except:
            print('Error writing file %s' % options.dump)
    print('')
    return

def Canonicalize(sIn):
    if sIn == '':
        return sIn
    elif sIn[0] != '/':
        return sIn
    elif sIn.find('#') == -1:
        return sIn
    else:
        i = 0
        iLen = len(sIn)
        sCanonical = ''
        while i < iLen:
            if sIn[i] == '#' and i < iLen - 2:
                try:
                    sCanonical += chr(int(sIn[i+1:i+3], 16))
                    i += 2
                except:
                    sCanonical += sIn[i]
            else:
                sCanonical += sIn[i]
            i += 1
        return sCanonical

def EqualCanonical(s1, s2):
    return Canonicalize(s1) == s2

def ConditionalCanonicalize(sIn, nocanonicalizedoutput):
    if nocanonicalizedoutput:
        return sIn
    else:
        return Canonicalize(sIn)

# http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer/pdfminer/ascii85.py
def ASCII85Decode(data):
  import struct
  n = b = 0
  out = b''
  for c in data:
    if '!' <= c and c <= 'u':
      n += 1
      b = b*85+(ord(c)-33)
      if n == 5:
        out += struct.pack('>L',b)
        n = b = 0
    elif c == 'z':
      assert n == 0
      out += b'\0\0\0\0'
    elif c == '~':
      if n:
        for _ in range(5-n):
          b = b*85+84
        out += struct.pack('>L',b)[:n-1]
      break
  return out

def ASCIIHexDecode(data):
    return binascii.unhexlify(''.join([c for c in data if c not in ' \t\n\r']).rstrip('>'))

# if inflating fails, we try to inflate byte per byte (sample 4da299d6e52bbb79c0ac00bad6a1d51d4d5fe42965a8d94e88a359e5277117e2)
def FlateDecode(data):
    try:
        return zlib.decompress(C2BIP3(data))
    except:
        if len(data) <= 10:
            raise
        oDecompress = zlib.decompressobj()
        oStringIO = StringIO()
        count = 0
        for byte in C2BIP3(data):
            try:
                oStringIO.write(oDecompress.decompress(byte))
                count += 1
            except:
                break
        if len(data) - count <= 2:
            return oStringIO.getvalue()
        else:
            raise

def RunLengthDecode(data):
    f = StringIO(data)
    decompressed = ''
    runLength = ord(f.read(1))
    while runLength:
        if runLength < 128:
            decompressed += f.read(runLength + 1)
        if runLength > 128:
            decompressed += f.read(1) * (257 - runLength)
        if runLength == 128:
            break
        runLength = ord(f.read(1))
#    return sub(r'(\d+)(\D)', lambda m: m.group(2) * int(m.group(1)), data)
    return decompressed

#### LZW code sourced from pdfminer
# Copyright (c) 2004-2009 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

class LZWDecoder(object):
    def __init__(self, fp):
        self.fp = fp
        self.buff = 0
        self.bpos = 8
        self.nbits = 9
        self.table = None
        self.prevbuf = None
        return

    def readbits(self, bits):
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8-self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v<<r) | (self.buff & ((1<<r)-1))
                bits -= r
                x = self.fp.read(1)
                if not x: raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        x = ''
        if code == 256:
            self.table = [ chr(c) for c in range(256) ] # 0-255
            self.table.append(None) # 256
            self.table.append(None) # 257
            self.prevbuf = ''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf+x[0])
            else:
                self.table.append(self.prevbuf+self.prevbuf[0])
                x = self.table[code]
            l = len(self.table)
            if l == 511:
                self.nbits = 10
            elif l == 1023:
                self.nbits = 11
            elif l == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
        return

####

def LZWDecode(data):
    return ''.join(LZWDecoder(StringIO(data)).run())

def PrintGenerateObject(object, options, newId=None):
    if newId == None:
        objectId = object.id
    else:
        objectId = newId
    dataPrecedingStream = object.ContainsStream()
    if dataPrecedingStream:
        if options.filter:
            decompressed = object.Stream(True, options.overridingfilters)
            if decompressed == 'No filters' or decompressed.startswith('Unsupported filter: '):
                print('    oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
            else:
                dictionary = FormatOutput(dataPrecedingStream, True)
                dictionary = re.sub(r'/Length\s+\d+', '', dictionary)
                dictionary = re.sub(r'/Filter\s*/[a-zA-Z0-9]+', '', dictionary)
                dictionary = re.sub(r'/Filter\s*\[.+\]', '', dictionary)
                dictionary = re.sub(r'^\s*<<', '', dictionary)
                dictionary = re.sub(r'>>\s*$', '', dictionary)
                dictionary = dictionary.strip()
                print("    oPDF.stream2(%d, %d, %s, %s, 'f')" % (objectId, object.version, repr(decompressed.rstrip()), repr(dictionary)))
        else:
            print('    oPDF.stream(%d, %d, %s, %s)' % (objectId, object.version, repr(object.Stream(False, options.overridingfilters).rstrip()), repr(re.sub('/Length\s+\d+', '/Length %d', FormatOutput(dataPrecedingStream, True)).strip())))
    else:
        print('    oPDF.indirectobject(%d, %d, %s)' % (objectId, object.version, repr(FormatOutput(object.content, True).strip())))

def PrintObject(object, options):
    if options.generate:
        PrintGenerateObject(object, options)
    else:
        PrintOutputObject(object, options)

def File2Strings(filename):
    try:
        f = open(filename, 'r')
    except:
        return None
    try:
        return map(lambda line:line.rstrip('\n'), f.readlines())
    except:
        return None
    finally:
        f.close()

def ProcessAt(argument):
    if argument.startswith('@'):
        strings = File2Strings(argument[1:])
        if strings == None:
            raise Exception('Error reading %s' % argument)
        else:
            return strings
    else:
        return [argument]

def YARACompile(ruledata):
    if ruledata.startswith('#'):
        if ruledata.startswith('#h#'):
            rule = binascii.a2b_hex(ruledata[3:])
        elif ruledata.startswith('#b#'):
            rule = binascii.a2b_base64(ruledata[3:])
        elif ruledata.startswith('#s#'):
            rule = 'rule string {strings: $a = "%s" ascii wide nocase condition: $a}' % ruledata[3:]
        elif ruledata.startswith('#q#'):
            rule = ruledata[3:].replace("'", '"')
        elif ruledata.startswith('#x#'):
            rule = 'rule hexadecimal {strings: $a = { %s } condition: $a}' % ruledata[3:]
        elif ruledata.startswith('#r#'):
            rule = 'rule regex {strings: $a = /%s/ ascii wide nocase condition: $a}' % ruledata[3:]
        else:
            rule = ruledata[1:]
        return yara.compile(source=rule), rule
    else:
        dFilepaths = {}
        if os.path.isdir(ruledata):
            for root, dirs, files in os.walk(ruledata):
                for file in files:
                    filename = os.path.join(root, file)
                    dFilepaths[filename] = filename
        else:
            for filename in ProcessAt(ruledata):
                dFilepaths[filename] = filename
        return yara.compile(filepaths=dFilepaths), ','.join(dFilepaths.values())

def AddDecoder(cClass):
    global decoders

    decoders.append(cClass)

class cDecoderParent():
    pass

def GetScriptPath():
    if getattr(sys, 'frozen', False):
        return os.path.dirname(sys.executable)
    else:
        return os.path.dirname(sys.argv[0])

def LoadDecoders(decoders, verbose):
    if decoders == '':
        return
    scriptPath = GetScriptPath()
    for decoder in sum(map(ProcessAt, decoders.split(',')), []):
        try:
            if not decoder.lower().endswith('.py'):
                decoder += '.py'
            if os.path.dirname(decoder) == '':
                if not os.path.exists(decoder):
                    scriptDecoder = os.path.join(scriptPath, decoder)
                    if os.path.exists(scriptDecoder):
                        decoder = scriptDecoder
            exec(open(decoder, 'r').read(), globals(), globals())
        except Exception as e:
            print('Error loading decoder: %s' % decoder)
            if verbose:
                raise e

class cIdentity(cDecoderParent):
    name = 'Identity function decoder'

    def __init__(self, stream, options):
        self.stream = stream
        self.options = options
        self.available = True

    def Available(self):
        return self.available

    def Decode(self):
        self.available = False
        return self.stream

    def Name(self):
        return ''

def DecodeFunction(decoders, options, stream):
    if decoders == []:
        return stream
    return decoders[0](stream, options.decoderoptions).Decode()

class cDumpStream():
    def __init__(self):
        self.text = ''

    def Addline(self, line):
        if line != '':
            self.text += line + '\n'

    def Content(self):
        return self.text

def HexDump(data):
    oDumpStream = cDumpStream()
    hexDump = ''
    for i, b in enumerate(data):
        if i % dumplinelength == 0 and hexDump != '':
            oDumpStream.Addline(hexDump)
            hexDump = ''
        hexDump += IFF(hexDump == '', '', ' ') + '%02X' % ord(b)
    oDumpStream.Addline(hexDump)
    return oDumpStream.Content()

def CombineHexAscii(hexDump, asciiDump):
    if hexDump == '':
        return ''
    return hexDump + '  ' + (' ' * (3 * (dumplinelength - len(asciiDump)))) + asciiDump

def HexAsciiDump(data):
    oDumpStream = cDumpStream()
    hexDump = ''
    asciiDump = ''
    for i, b in enumerate(data):
        if i % dumplinelength == 0:
            if hexDump != '':
                oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
            hexDump = '%08X:' % i
            asciiDump = ''
        hexDump+= ' %02X' % ord(b)
        asciiDump += IFF(ord(b) >= 32, b, '.')
    oDumpStream.Addline(CombineHexAscii(hexDump, asciiDump))
    return oDumpStream.Content()

def HexAsciiDumpLine(data):
    return HexAsciiDump(data[0:16])[10:-1]

def ParseINIFile():
    oConfigParser = ConfigParser.ConfigParser(allow_no_value=True)
    oConfigParser.optionxform = str
    oConfigParser.read(os.path.join(GetScriptPath(), 'pdfid.ini'))
    keywords = []
    if oConfigParser.has_section('keywords'):
        for key, value in oConfigParser.items('keywords'):
            if not key in keywords:
                keywords.append(key)
    return keywords

def MatchObjectID(id, selection):
    return str(id) in selection.split(',')

def GetArguments():
    arguments = sys.argv[1:]
    envvar = os.getenv('PDFPARSER_OPTIONS')
    if envvar == None:
        return arguments
    return envvar.split(' ') + arguments

class cHashCRC32():
    def __init__(self):
        self.crc32 = None

    def update(self, data):
        self.crc32 = zlib.crc32(data)

    def hexdigest(self):
        return '%08x' % (self.crc32 & 0xffffffff)

class cHashChecksum8():
    def __init__(self):
        self.sum = 0

    def update(self, data):
        if sys.version_info[0] >= 3:
            self.sum += sum(data)
        else:
            self.sum += sum(map(ord, data))

    def hexdigest(self):
        return '%08x' % (self.sum)

dSpecialHashes = {'crc32': cHashCRC32, 'checksum8': cHashChecksum8}

def GetHashObjects(algorithms):
    global dSpecialHashes

    dHashes = {}

    if algorithms == '':
        algorithms = os.getenv('DSS_DEFAULT_HASH_ALGORITHMS', 'md5')
    if ',' in algorithms:
        hashes = algorithms.split(',')
    else:
        hashes = algorithms.split(';')
    for name in hashes:
        if not name in dSpecialHashes.keys() and not name in hashlib.algorithms_available:
            print('Error: unknown hash algorithm: %s' % name)
            print('Available hash algorithms: ' + ' '.join([name for name in list(hashlib.algorithms_available)] + list(dSpecialHashes.keys())))
            return [], {}
        elif name in dSpecialHashes.keys():
            dHashes[name] = dSpecialHashes[name]()
        else:
            dHashes[name] = hashlib.new(name)

    return hashes, dHashes

def CalculateChosenHash(data):
    hashes, dHashes = GetHashObjects('')
    dHashes[hashes[0]].update(data)
    return dHashes[hashes[0]].hexdigest(), hashes[0]

class cMyJSONOutput():

    def __init__(self):
        self.items = []
        self.counter = 1

    def AddIdItem(self, id, name, data):
        self.items.append({'id': id, 'name': name, 'content': binascii.b2a_base64(data).strip(b'\n').decode()})

    def AddItem(self, name, data):
        self.AddIdItem(self.counter, name, data)
        self.counter += 1

    def GetJSON(self):
        return json.dumps({'version': 2, 'id': 'didierstevens.com', 'type': 'content', 'fields': ['id', 'name', 'content'], 'items': self.items})

def Main():
    """pdf-parser, use it to parse a PDF document
    """

    global decoders

    oParser = optparse.OptionParser(usage='usage: %prog [options] pdf-file|zip-file|url\n' + __description__, version='%prog ' + __version__)
    oParser.add_option('-m', '--man', action='store_true', default=False, help='Print manual')
    oParser.add_option('-s', '--search', help='string to search in indirect objects (except streams)')
    oParser.add_option('-f', '--filter', action='store_true', default=False, help='pass stream object through filters (FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode and RunLengthDecode only)')
    oParser.add_option('-o', '--object', help='id(s) of indirect object(s) to select, use comma (,) to separate ids (version independent)')
    oParser.add_option('-r', '--reference', help='id of indirect object being referenced (version independent)')
    oParser.add_option('-e', '--elements', help='type of elements to select (cxtsi)')
    oParser.add_option('-w', '--raw', action='store_true', default=False, help='raw output for data and filters')
    oParser.add_option('-a', '--stats', action='store_true', default=False, help='display stats for pdf document')
    oParser.add_option('-t', '--type', help='type of indirect object to select')
    oParser.add_option('-O', '--objstm', action='store_true', default=False, help='parse stream of /ObjStm objects')
    oParser.add_option('-v', '--verbose', action='store_true', default=False, help='display malformed PDF elements')
    oParser.add_option('-x', '--extract', help='filename to extract malformed content to')
    oParser.add_option('-H', '--hash', action='store_true', default=False, help='display hash of objects')
    oParser.add_option('-n', '--nocanonicalizedoutput', action='store_true', default=False, help='do not canonicalize the output')
    oParser.add_option('-d', '--dump', help='filename to dump stream content to')
    oParser.add_option('-D', '--debug', action='store_true', default=False, help='display debug info')
    oParser.add_option('-c', '--content', action='store_true', default=False, help='display the content for objects without streams or with streams without filters')
    oParser.add_option('--searchstream', help='string to search in streams')
    oParser.add_option('--unfiltered', action='store_true', default=False, help='search in unfiltered streams')
    oParser.add_option('--casesensitive', action='store_true', default=False, help='case sensitive search in streams')
    oParser.add_option('--regex', action='store_true', default=False, help='use regex to search in streams')
    oParser.add_option('--overridingfilters', type=str, default='', help='override filters with given filters (use raw for the raw stream content)')
    oParser.add_option('-g', '--generate', action='store_true', default=False, help='generate a Python program that creates the parsed PDF file')
    oParser.add_option('--generateembedded', type=int, default=0, help='generate a Python program that embeds the selected indirect object as a file')
    oParser.add_option('-y', '--yara', help='YARA rule (or directory or @file) to check streams (can be used with option --unfiltered)')
    oParser.add_option('--yarastrings', action='store_true', default=False, help='Print YARA strings')
    oParser.add_option('--decoders', type=str, default='', help='decoders to load (separate decoders with a comma , ; @file supported)')
    oParser.add_option('--decoderoptions', type=str, default='', help='options for the decoder')
    oParser.add_option('-k', '--key', help='key to search in dictionaries')
    oParser.add_option('-j', '--jsonoutput', action='store_true', default=False, help='produce json output')
    (options, args) = oParser.parse_args(GetArguments())

    if options.man:
        oParser.print_help()
        PrintManual()
        return 0

    if len(args) != 1:
        oParser.print_help()
        print('')
        print('  %s' % __description__)
        print('  Source code put in the public domain by Didier Stevens, no Copyright')
        print('  Use at your own risk')
        print('  https://DidierStevens.com')

    else:
        decoders = []
        LoadDecoders(options.decoders, True)

        oPDFParser = cPDFParser(args[0], options.verbose, options.extract)
        cntComment = 0
        cntXref = 0
        cntTrailer = 0
        cntStartXref = 0
        cntIndirectObject = 0
        dicObjectTypes = {}
        objectsAll = set()
        objectsReferenced = set()
        objectsWithStream = []
        keywords = ['/JS', '/JavaScript', '/AA', '/OpenAction', '/AcroForm', '/RichMedia', '/Launch', '/EmbeddedFile', '/XFA', '/URI']
        for extrakeyword in ParseINIFile():
            if not extrakeyword in keywords:
                keywords.append(extrakeyword)

#        dKeywords = {keyword: [] for keyword in keywords}
# Done for compatibility with 2.6.6
        dKeywords = {}
        for keyword in keywords:
            dKeywords[keyword] = []

        selectComment = False
        selectXref = False
        selectTrailer = False
        selectStartXref = False
        selectIndirectObject = False
        if options.elements:
            for c in options.elements:
                if c == 'c':
                    selectComment = True
                elif c == 'x':
                    selectXref = True
                elif c == 't':
                    selectTrailer = True
                elif c == 's':
                    selectStartXref = True
                elif c == 'i':
                    selectIndirectObject = True
                else:
                    print('Error: unknown --elements value %s' % c)
                    return
        else:
            selectIndirectObject = True
            if not options.search and not options.object and not options.reference and not options.type and not options.searchstream and not options.key:
                selectComment = True
                selectXref = True
                selectTrailer = True
                selectStartXref = True
            if options.search or options.key or options.reference:
                selectTrailer = True

        if options.type == '-':
            optionsType = ''
        else:
            optionsType = options.type

        if options.generate or options.generateembedded != 0:
            savedRoot = ['1', '0', 'R']
            print('#!/usr/bin/python')
            print('')
            print('"""')
            print('')
            print('Program generated by pdf-parser.py by Didier Stevens')
            print('https://DidierStevens.com')
            print('Use at your own risk')
            print('')
            print('Input PDF file: %s' % args[0])
            print('This Python program was created on: %s' % Timestamp())
            print('')
            print('"""')
            print('')
            print('import mPDF')
            print('import sys')
            print('')
            print('def Main():')
            print('    if len(sys.argv) != 2:')
            print("        print('Usage: %s pdf-file' % sys.argv[0])")
            print('        return')
            print('    oPDF = mPDF.cPDF(sys.argv[1])')

        if options.generateembedded != 0:
            print("    oPDF.header('1.1')")
            print(r"    oPDF.comment('\xd0\xd0\xd0\xd0')")
            print(r"    oPDF.indirectobject(1, 0, '<<\r\n /Type /Catalog\r\n /Outlines 2 0 R\r\n /Pages 3 0 R\r\n /Names << /EmbeddedFiles << /Names [(test.bin) 7 0 R] >> >>\r\n>>')")
            print(r"    oPDF.indirectobject(2, 0, '<<\r\n /Type /Outlines\r\n /Count 0\r\n>>')")
            print(r"    oPDF.indirectobject(3, 0, '<<\r\n /Type /Pages\r\n /Kids [4 0 R]\r\n /Count 1\r\n>>')")
            print(r"    oPDF.indirectobject(4, 0, '<<\r\n /Type /Page\r\n /Parent 3 0 R\r\n /MediaBox [0 0 612 792]\r\n /Contents 5 0 R\r\n /Resources <<\r\n             /ProcSet [/PDF /Text]\r\n             /Font << /F1 6 0 R >>\r\n            >>\r\n>>')")
            print(r"    oPDF.stream(5, 0, 'BT /F1 12 Tf 70 700 Td 15 TL (This PDF document embeds file test.bin) Tj ET', '<< /Length %d >>')")
            print(r"    oPDF.indirectobject(6, 0, '<<\r\n /Type /Font\r\n /Subtype /Type1\r\n /Name /F1\r\n /BaseFont /Helvetica\r\n /Encoding /MacRomanEncoding\r\n>>')")
            print(r"    oPDF.indirectobject(7, 0, '<<\r\n /Type /Filespec\r\n /F (test.bin)\r\n /EF << /F 8 0 R >>\r\n>>')")

        if options.yara != None:
            if not 'yara' in sys.modules:
                print('Error: option yara requires the YARA Python module.')
                return
            rules, rulesVerbose = YARACompile(options.yara)
            if options.verbose:
                print(rulesVerbose)

        oPDFParserOBJSTM = None
        oMyJSONOutput = cMyJSONOutput()
        while True:
            if oPDFParserOBJSTM == None:
                object = oPDFParser.GetObject()
            else:
                object = oPDFParserOBJSTM.GetObject()
                if object == None:
                    oPDFParserOBJSTM = None
                    object = oPDFParser.GetObject()
            if options.objstm and hasattr(object, 'GetType') and EqualCanonical(object.GetType(), '/ObjStm') and object.ContainsStream():
                # parsing objects inside an /ObjStm object by extracting & parsing the stream content to create a synthesized PDF document, that is then parsed by cPDFParser
                oPDFParseDictionary = cPDFParseDictionary(object.ContainsStream(), options.nocanonicalizedoutput)
                numberOfObjects = int(oPDFParseDictionary.Get('/N')[0])
                offsetFirstObject = int(oPDFParseDictionary.Get('/First')[0])
                indexes = list(map(int, C2SIP3(object.Stream())[:offsetFirstObject].strip().replace('\n', ' ').split(' ')))
                if len(indexes) % 2 != 0 or len(indexes) / 2 != numberOfObjects:
                    raise Exception('Error in index of /ObjStm stream')
                streamObject = C2SIP3(object.Stream()[offsetFirstObject:])
                synthesizedPDF = ''
                while len(indexes) > 0:
                    objectNumber = indexes[0]
                    offset = indexes[1]
                    indexes = indexes[2:]
                    if len(indexes) >= 2:
                        offsetNextObject = indexes[1]
                    else:
                        offsetNextObject = len(streamObject)
                    synthesizedPDF += '%d 0 obj\n%s\nendobj\n' % (objectNumber, streamObject[offset:offsetNextObject])
                oPDFParserOBJSTM = cPDFParser(StringIO(synthesizedPDF), options.verbose, options.extract, (object.id, object.version))
            if object != None:
                if options.stats:
                    if object.type == PDF_ELEMENT_COMMENT:
                        cntComment += 1
                    elif object.type == PDF_ELEMENT_XREF:
                        cntXref += 1
                    elif object.type == PDF_ELEMENT_TRAILER:
                        cntTrailer += 1
                        oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput)
                        for keyTrailer, valueTrailer in oPDFParseDictionary.parsed:
                            if len(valueTrailer) == 3 and valueTrailer[2] == 'R' and IsNumeric(valueTrailer[0]) and IsNumeric(valueTrailer[1]):
                                objectsReferenced.add(tuple(valueTrailer))
                    elif object.type == PDF_ELEMENT_STARTXREF:
                        cntStartXref += 1
                    elif object.type == PDF_ELEMENT_INDIRECT_OBJECT:
                        cntIndirectObject += 1
                        type1 = object.GetType()
                        if not type1 in dicObjectTypes:
                            dicObjectTypes[type1] = [object.id]
                        else:
                            dicObjectTypes[type1].append(object.id)
                        for keyword in dKeywords.keys():
                            if object.ContainsName(keyword):
                                dKeywords[keyword].append(object.id)
                        if object.ContainsStream():
                            objectsWithStream.append(object.id)
                        for reference in object.GetReferences():
                            objectsReferenced.add(reference)
                        objectsAll.add((str(object.id), str(object.version), 'R'))
                elif options.jsonoutput:
                    if object.type == PDF_ELEMENT_INDIRECT_OBJECT:
                        if object.ContainsStream():
                            filtered = object.Stream(options.filter == True, options.overridingfilters)
                            if filtered == []:
                                filtered = ''
                            oMyJSONOutput.AddItem('obj %s %s' % (object.id, object.version), C2BIP3(filtered))
                else:
                    if object.type == PDF_ELEMENT_COMMENT and selectComment:
                        if options.generate:
                            comment = object.comment[1:].rstrip()
                            if re.match('PDF-\d\.\d', comment):
                                print("    oPDF.header('%s')" % comment[4:])
                            elif comment != '%EOF':
                                print('    oPDF.comment(%s)' % repr(comment))
                        elif options.yara == None and options.generateembedded == 0:
                            print('PDF Comment %s' % FormatOutput(object.comment, options.raw))
                            print('')
                    elif object.type == PDF_ELEMENT_XREF and selectXref:
                        if not options.generate and options.yara == None and options.generateembedded == 0:
                            if options.debug:
                                print('xref %s' % FormatOutput(object.content, options.raw))
                            else:
                                print('xref')
                            print('')
                    elif object.type == PDF_ELEMENT_TRAILER and selectTrailer:
                        oPDFParseDictionary = cPDFParseDictionary(object.content[1:], options.nocanonicalizedoutput)
                        if options.generate:
                            result = oPDFParseDictionary.Get('/Root')
                            if result != None:
                                savedRoot = result
                        elif options.yara == None and options.generateembedded == 0:
                            if not options.search and not options.key and not options.reference or options.search and object.Contains(options.search):
                                if oPDFParseDictionary == None:
                                    print('trailer %s' % FormatOutput(object.content, options.raw))
                                else:
                                    print('trailer')
                                    oPDFParseDictionary.PrettyPrint('  ')
                                print('')
                            elif options.key:
                                if oPDFParseDictionary.parsed != None:
                                    result = oPDFParseDictionary.GetNested(options.key)
                                    if result != None:
                                        print(result)
                            elif options.reference:
                                for key, value in oPDFParseDictionary.Retrieve():
                                    if value == [str(options.reference), '0', 'R']:
                                        print('trailer')
                                        oPDFParseDictionary.PrettyPrint('  ')
                    elif object.type == PDF_ELEMENT_STARTXREF and selectStartXref:
                        if not options.generate and options.yara == None and options.generateembedded == 0:
                            print('startxref %d' % object.index)
                            print('')
                    elif object.type == PDF_ELEMENT_INDIRECT_OBJECT and selectIndirectObject:
                        if options.search:
                            if object.Contains(options.search):
                                PrintObject(object, options)
                        elif options.key:
                            contentDictionary = object.ContainsStream()
                            if not contentDictionary:
                                contentDictionary = object.content[1:]
                            oPDFParseDictionary = cPDFParseDictionary(contentDictionary, options.nocanonicalizedoutput)
                            if oPDFParseDictionary.parsed != None:
                                result = oPDFParseDictionary.GetNested(options.key)
                                if result != None:
                                    print(result)
                        elif options.object:
                            if MatchObjectID(object.id, options.object):
                                PrintObject(object, options)
                        elif options.reference:
                            if object.References(options.reference):
                                PrintObject(object, options)
                        elif options.type:
                            if EqualCanonical(object.GetType(), optionsType):
                                PrintObject(object, options)
                        elif options.hash:
                            print('obj %d %d' % (object.id, object.version))
                            rawContent = FormatOutput(object.content, True)
                            hashHexdigest, hashAlgo = CalculateChosenHash(rawContent.encode('latin'))
                            print(' len: %d %s: %s' % (len(rawContent), hashAlgo, hashHexdigest))
                            print('')
                        elif options.searchstream:
                            if object.StreamContains(options.searchstream, not options.unfiltered, options.casesensitive, options.regex, options.overridingfilters):
                                PrintObject(object, options)
                        elif options.yara != None:
                            results = object.StreamYARAMatch(rules, decoders, options.decoderoptions, not options.unfiltered, options.overridingfilters)
                            if results != None and results != []:
                                for result in results:
                                    for yaraResult in result[1]:
                                        print('YARA rule%s: %s (%s)' % (IFF(result[0] == '', '', ' (stream decoder: %s)' % result[0]), yaraResult.rule, yaraResult.namespace))
                                        if options.yarastrings:
                                            for stringdata in yaraResult.strings:
                                                print('%06x %s:' % (stringdata[0], stringdata[1]))
                                                print(' %s' % binascii.hexlify(C2BIP3(stringdata[2])))
                                                print(' %s' % repr(stringdata[2]))
                                    PrintObject(object, options)
                        elif options.generateembedded != 0:
                            if object.id == options.generateembedded:
                                PrintGenerateObject(object, options, 8)
                        else:
                            PrintObject(object, options)
                    elif object.type == PDF_ELEMENT_MALFORMED:
                        try:
                            fExtract = open(options.extract, 'wb')
                            try:
                                fExtract.write(C2BIP3(object.content))
                            except:
                                print('Error writing file %s' % options.extract)
                            fExtract.close()
                        except:
                            print('Error writing file %s' % options.extract)
            else:
                break

        if options.stats:
            print('Comment: %s' % cntComment)
            print('XREF: %s' % cntXref)
            print('Trailer: %s' % cntTrailer)
            print('StartXref: %s' % cntStartXref)
            print('Indirect object: %s' % cntIndirectObject)
            print('Indirect objects with a stream: %s' % ', '.join([str(id) for id in objectsWithStream]))
            objectsUnreferenced = objectsAll - objectsReferenced
            for key in sorted(dicObjectTypes.keys()):
                print(' %s %d: %s' % (key, len(dicObjectTypes[key]), ', '.join(map(lambda x: '%d' % x, dicObjectTypes[key]))))
            if len(objectsUnreferenced) > 0:
                print('Unreferenced indirect objects: %s' % ', '.join([' '.join(reference) for reference in sorted(objectsUnreferenced, key=lambda a: int(a[0]))]))
                if '/ObjStm' in dicObjectTypes:
                    objectsUnreferencedMinusObjStm = set()
                    for unreferencedObject in objectsUnreferenced:
                        if not int(unreferencedObject[0]) in dicObjectTypes['/ObjStm']:
                            objectsUnreferencedMinusObjStm.add(unreferencedObject)
                    print('Unreferenced indirect objects without /ObjStm objects: %s' % ', '.join([' '.join(reference) for reference in sorted(objectsUnreferencedMinusObjStm, key=lambda a: int(a[0]))]))
            if sum(map(len, dKeywords.values())) > 0:
                print('Search keywords:')
                for keyword in keywords:
                    if len(dKeywords[keyword]) > 0:
                        print(' %s %d: %s' % (keyword, len(dKeywords[keyword]), ', '.join(map(lambda x: '%d' % x, dKeywords[keyword]))))

        if options.jsonoutput:
            print(oMyJSONOutput.GetJSON())

        if options.generate or options.generateembedded != 0:
            print("    oPDF.xrefAndTrailer('%s')" % ' '.join(savedRoot))
            print('')
            print("if __name__ == '__main__':")
            print('    Main()')

def TestPythonVersion(enforceMaximumVersion=False, enforceMinimumVersion=False):
    if sys.version_info[0:3] > __maximum_python_version__:
        if enforceMaximumVersion:
            print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
            print('Please use Python version %d.%d.%d' % __maximum_python_version__)
            sys.exit()
        else:
            print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
            print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__)
    if sys.version_info[0:3] < __minimum_python_version__:
        if enforceMinimumVersion:
            print('This program does not work with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
            print('Please use Python version %d.%d.%d' % __maximum_python_version__)
            sys.exit()
        else:
            print('This program has not been tested with this version of Python (%d.%d.%d)' % sys.version_info[0:3])
            print('Should you encounter problems, please use Python version %d.%d.%d' % __maximum_python_version__)

if __name__ == '__main__':
    TestPythonVersion()
    Main()