Skip to content

Commit 1f561c2

Browse files
committed
Basic JPEG file extract
1 parent dd13fb5 commit 1f561c2

File tree

1 file changed

+109
-68
lines changed

1 file changed

+109
-68
lines changed

PythonFileFormats/JPEG.py

Lines changed: 109 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ def bytesToInt(bytes, alignmentIndicator, signed=False) :
1515
return i
1616

1717
def bytesToASCIIString(bytes) :
    """Decode a byte string read from an IFD ASCII element into a str.

    IFD string elements are stored with a trailing null byte; a single
    trailing 0x00 is removed before decoding. An empty input yields an
    empty string rather than raising IndexError.

    bytes -- the raw bytes of the string (bytes or bytearray).
    Returns the decoded string (default codec of bytes.decode()).
    """
    # Remove trailing null used in IFD string elements. endswith() is safe on
    # empty input, unlike indexing bytes[-1].
    if bytes.endswith(b'\x00') :
        bytes = bytes[:-1]
    return bytes.decode()
2022

2123
# Extract first n bytes up to a 0 byte, expect this to be an ASCII string identifying the type of App Segment, e.g. "Exif"
2224
def getAppSegmentIdentifier(segment) :
@@ -83,7 +85,7 @@ def readEntropyCodedDataSegment(f) :
8385
#
8486

8587
#http://gvsoft.no-ip.org/exif/exif-explanation.html
86-
def processExifSegment(dict, info, segment) :
88+
def processExifSegment(info, segment) :
8789

8890
# Expect first six bytes to be 'Exif\x00\x00'
8991
ExifIdentifierLength = 6
@@ -106,38 +108,79 @@ def processExifSegment(dict, info, segment) :
106108
firstIFDOffset = bytesToInt(TIFFHeader[4:8], byteAlignmentIndicator)
107109
#print(firstIFDOffset)
108110

109-
# Then a chained set of IFD blocks, which sometimes contain embedded pointers to further specialised IFD blocks,
110-
# which we record and look at after processing the main chain of IFDs.
111-
embeddedIFDOffsets = [] # List of (IFD type, offset) tuples
112111
nextIFDOffset = firstIFDOffset
113112
IFDCount = 0
113+
114+
# Dictionary to record each IFD, keyed by IFD name, storing the detailed IFD dictionary as the value
115+
dict = {}
116+
114117
while nextIFDOffset != 0 :
118+
IFDname = "IFD" + str(IFDCount)
119+
#print("Handling main chain IFD:", IFDname)
120+
IFDentries, nextIFDOffset = processIFD(TIFF, nextIFDOffset, byteAlignmentIndicator)
121+
dict[IFDname] = IFDentries
115122
IFDCount += 1
116-
print("Handling main chain IFD:", IFDCount)
117-
entries, nextIFDOffset = processIFD(TIFF, nextIFDOffset, byteAlignmentIndicator)
118-
# Look for IFD elements known to indicate an embedded IFD offset
119-
for entry in entries:
120-
if entry['tag'] in [34853, 34665] :
121-
embeddedIFDOffsets.append( (entry['tag'], entry['value']) )
122-
dict[entry['tag']] = entry
123-
# ???? Can a tag be repeated across the set of IFDs ? If so what to do - put them in a list ?
124-
# ???? NB Issue affecrs embeddedIFDOffsets method too, perhaps handle in there as well
125-
126-
if len(embeddedIFDOffsets) > 0 :
127-
print("Found embedded IFDs within this IFD:", embeddedIFDOffsets)
128-
for (id, os) in embeddedIFDOffsets :
129-
print("Looking at embedded IFD:", id)
130-
# ???? Need to store results. Do we ever expect a non-zero next-in-chain value ?
131-
processIFD(TIFF, os, byteAlignmentIndicator)
132123

124+
# Search for embedded IFD elements within the IFDs we've already identified. Assume only one embedded IFD
125+
# of each type. IFDs can be nested by more than one level, so keep going as long as we find new IFDs
126+
127+
continueLooking = True
128+
while(continueLooking) :
129+
newIFDinfo = []
130+
for knownIFDname, d in dict.items() :
131+
for embeddedIFDtag, embeddedIFDname in knownEmbeddedIFDs().items() :
132+
# This will re-search all IFDs each time through the loop, not just ones we've added last time
133+
# around, so ignore embedded IFDs we've already picked up. (Assuming they only exist in one place.)
134+
if embeddedIFDtag in d and embeddedIFDname not in dict:
135+
IFDname = embeddedIFDname
136+
#print("Handling embedded IFD:", IFDname)
137+
embeddedIFDOffset = d[embeddedIFDtag]['value']
138+
embeddedIFDentries, nextIFDOffset = processIFD(TIFF, embeddedIFDOffset, byteAlignmentIndicator)
139+
# Put info about embedded IFD onto a list, we can't put it directly in the main dictionary
140+
# while looping over the dictionary,
141+
newIFDinfo.append( (embeddedIFDname, embeddedIFDentries) )
142+
if nextIFDOffset != 0000 :
143+
print("*** - unexpected next IFD offset in IFD", embeddedIFDname)
144+
# Can now add the new IFD(s) to the main dictionary
145+
for additionalIFDname, IFDentries in newIFDinfo :
146+
dict[additionalIFDname] = IFDentries
147+
continueLooking = len(newIFDinfo) > 0
148+
149+
return dict
150+
151+
# Return a dictionary mapping the IFD element tags that point at an embedded IFD
# to the name under which that embedded IFD is recorded. processExifSegment treats
# the 'value' of an element with one of these tags as the offset of the embedded IFD.
def knownEmbeddedIFDs() :
152+
return {
153+
34665 : "Exif",
154+
34853 : "GPS",
155+
40965 : "Interoperability"
156+
}
157+
158+
# Add an IFD element (itself a dictionary) to an IFD-level dictionary. If the element's tag does not already exist in the
159+
# IFD dictionary just add the element. If it does, add the element to a list associated with the tag instead.
160+
def addToIFDDictionary (IFDdict, element) :
    """Record an IFD element in an IFD-level dictionary, keyed by its tag.

    IFDdict -- dictionary of elements for one IFD; modified in place.
    element -- dictionary describing one IFD element; must contain 'tag'.

    The first element seen for a tag is stored directly. If the tag is seen
    again, the stored value becomes a list holding every element for that
    tag, in the order they were added. Returns None.
    """
    tag = element['tag']
    if tag not in IFDdict :
        IFDdict[tag] = element
        return

    currentValue = IFDdict[tag]
    if isinstance(currentValue, list) :
        # Already multiple entries for this tag - append to the existing list
        currentValue.append(element)
    else :
        # Second occurrence - convert the single element to a list of elements.
        # Index with the tag variable, not the literal string 'tag'.
        IFDdict[tag] = [currentValue, element]
175+
133176
# Each IFD (Image File Directory) consists of:
134177
# - a two-byte int giving the number of directory elements
135178
# - the 12-byte elements
136179
# - a four-byte offset to the start of the next IFD in this chain, or 0000 if the end of the chain
137180
def processIFD(TIFF, IFDOffset, byteAlignmentIndicator) :
138181

139182
# Dictionary for output, keyed by element tag; each value is the element's dictionary (or a list of them if the tag repeats)
140-
IFDEntries = []
183+
IFDEntries = {}
141184

142185
IFDBytes = TIFF[IFDOffset:]
143186
# 2 byte value indicating the number of elements
@@ -150,7 +193,7 @@ def processIFD(TIFF, IFDOffset, byteAlignmentIndicator) :
150193
for n in range (0, elementCount) :
151194
thisElementBytes = elementBytes[elementSize*n : elementSize*(n+1)]
152195
element = processIFDElement(n, thisElementBytes, TIFF, byteAlignmentIndicator)
153-
IFDEntries.append(element)
196+
addToIFDDictionary (IFDEntries, element)
154197

155198
# The final four bytes are either an offset to the next IFD in the chain, or 0000 if no more IFDs in this chain
156199
nextOffsetBytesPosition = 2+elementSize*elementCount
@@ -170,106 +213,101 @@ def processIFDElement(elementNo, element, TIFF, byteAlignmentIndicator) :
170213

171214
tag = bytesToInt(element[0:2], byteAlignmentIndicator)
172215
dataFormat = bytesToInt(element[2:4], byteAlignmentIndicator)
173-
numComponents = bytesToInt(element[4:8], byteAlignmentIndicator)
216+
componentCount = bytesToInt(element[4:8], byteAlignmentIndicator)
174217
dataBytes = element[8:12]
175218
dataBytesAsOffset = bytesToInt(dataBytes, byteAlignmentIndicator)
176219

177220
implemented = True
178221
dataValue = "-"
179222
# 1 = unsigned byte, 1 byte per component
180223
if dataFormat == 1 :
181-
if numComponents == 1 :
224+
if componentCount == 1 :
182225
dataValue = bytesToInt(dataBytes[0:1], byteAlignmentIndicator)
183-
elif numComponents <= 4:
226+
elif componentCount <= 4:
184227
dataValue = []
185-
for i in range (0, numComponents) :
228+
for i in range (0, componentCount) :
186229
dataValue.append(bytesToInt(dataBytes[i:i+1], byteAlignmentIndicator))
187-
elif numComponents > 4 :
230+
elif componentCount > 4 :
188231
dataValue = []
189-
for i in range (0, numComponents) :
232+
for i in range (0, componentCount) :
190233
offset = dataBytesAsOffset + i
191234
dataValue.append(bytesToInt(TIFF[offset:offset+1], byteAlignmentIndicator))
192-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(ubyte), num:", numComponents, ", val:", dataValue)
235+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(ubyte), num:", componentCount, ", val:", dataValue)
193236
# 2 = ASCII string, 1 byte per character
194237
elif dataFormat == 2 :
195-
if numComponents <= 4:
196-
dataValue = bytesToASCIIString(dataBytes[0:numComponents])
238+
if componentCount <= 4:
239+
dataValue = bytesToASCIIString(dataBytes[0:componentCount])
197240
else :
198-
dataValue = bytesToASCIIString(TIFF[dataBytesAsOffset:dataBytesAsOffset+numComponents])
199-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(String), num:", numComponents, ", val:", dataValue)
241+
dataValue = bytesToASCIIString(TIFF[dataBytesAsOffset:dataBytesAsOffset+componentCount])
242+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(String), num:", componentCount, ", val:", dataValue)
200243
# 3 = unsigned short, 2 bytes per component
201244
elif dataFormat == 3 :
202-
if numComponents == 1 :
245+
if componentCount == 1 :
203246
dataValue = bytesToInt(dataBytes[0:2], byteAlignmentIndicator)
204-
elif numComponents == 2:
247+
elif componentCount == 2:
205248
dataValue = [ bytesToInt(dataBytes[0:2], byteAlignmentIndicator), bytesToInt(dataBytes[2:4], byteAlignmentIndicator) ]
206-
elif numComponents > 2 :
249+
elif componentCount > 2 :
207250
dataValue = []
208-
for i in range (0, numComponents) :
251+
for i in range (0, componentCount) :
209252
offset = dataBytesAsOffset + i*2
210253
dataValue.append(bytesToInt(TIFF[offset:offset+2], byteAlignmentIndicator))
211-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(ushort), num:", numComponents, ", val:", dataValue)
254+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(ushort), num:", componentCount, ", val:", dataValue)
212255
# 4 = unsigned long, 9 = signed long, 4 bytes per component
213256
elif dataFormat in [4, 9] :
214257
signed = dataFormat == 9
215258
desc = "(ulong)" if dataFormat == 4 else "(long)"
216-
if numComponents == 1 :
259+
if componentCount == 1 :
217260
dataValue = bytesToInt(dataBytes[0:4], byteAlignmentIndicator, signed)
218-
elif numComponents > 1 :
261+
elif componentCount > 1 :
219262
dataValue = []
220-
for i in range (0, numComponents) :
263+
for i in range (0, componentCount) :
221264
offset = dataBytesAsOffset + i*4
222265
dataValue.append(bytesToInt(TIFF[offset:offset+2], byteAlignmentIndicator, signed))
223-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, desc, ", num:", numComponents, ", val:", dataValue)
266+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, desc, ", num:", componentCount, ", val:", dataValue)
224267
elif dataFormat in [5, 10] :
225268
signed = dataFormat == 10
226269
desc = "(urational)" if dataFormat == 5 else "(rational)"
227270
values = []
228-
for i in range (0, numComponents) :
271+
for i in range (0, componentCount) :
229272
offset = dataBytesAsOffset + i*8
230273
numerator = bytesToInt(TIFF[offset:offset+4], byteAlignmentIndicator, signed)
231274
denominator = bytesToInt(TIFF[offset+4:offset+8], byteAlignmentIndicator, signed)
232275
values.append( (numerator, denominator) )
233-
if numComponents == 1 :
276+
if componentCount == 1 :
234277
dataValue = values[0]
235278
else :
236279
dataValue = values
237-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, desc, ", num:", numComponents, ", val:", dataValue)
238-
# 7 = General purpose undefined. 1 byte per component
280+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, desc, ", num:", componentCount, ", val:", dataValue)
281+
# 7 = General purpose 'undefined' type. 1 byte per component
239282
elif dataFormat == 7 :
240-
if numComponents == 1 :
283+
if componentCount == 1 :
241284
dataValue = dataBytes[0:1]
242-
elif numComponents <= 4:
285+
elif componentCount <= 4:
243286
dataValue = []
244-
for i in range (0, numComponents) :
287+
for i in range (0, componentCount) :
245288
dataValue.append(dataBytes[i:i+1])
246-
elif numComponents > 4 :
289+
elif componentCount > 4 :
247290
dataValue = []
248-
for i in range (0, numComponents) :
291+
for i in range (0, componentCount) :
249292
offset = dataBytesAsOffset + i
250293
dataValue.append(TIFF[offset:offset+1])
251-
print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(undefined), num:", numComponents, ", val:", dataValue[0:12])
294+
#print(".. IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, "(undefined), num:", componentCount, ", val:", dataValue[0:12])
252295
else :
253296
implemented = False
254297

255-
# ???? Remove prints
256-
# Fill in dictionary some more, allow for repeated values ?
257-
# Caller to handle different IFDs vs repeated values in different IFDs ????
258-
# NB Thumbnail vs full image ????
259-
260-
# Structure to allow easy look by tag ?
261-
298+
# Put values for the element into a dictionary and return it
262299
entry = {}
263300
entry['tag'] = tag
301+
entry['index'] = elementNo
302+
entry['format'] = dataFormat
303+
entry['count'] = componentCount
264304
entry['value'] = dataValue
265305

266-
if implemented :
267-
pass
268-
else :
269-
print("*** IFD data type not implemented: IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, ", num:", numComponents, ", bytes:", dataBytes)
270-
271-
return entry
306+
if not implemented :
307+
entry['unhandled'] = True
308+
print("*** IFD data type not implemented: IFD item no:", elementNo, "tag:", tag, ", dataFormat:", dataFormat, ", num:", componentCount, ", bytes:", dataBytes)
272309

310+
return entry
273311

274312
#
275313
#############################################
@@ -419,7 +457,10 @@ def processFile(filename) :
419457
appName = info['app']
420458
dict = {}
421459
if appName == "Exif" :
422-
processExifSegment(dict, info, data)
460+
ExifDict = processExifSegment(info, data)
461+
print("Extracted these IFDs from the Exif segment:")
462+
for n, d in ExifDict.items() :
463+
print("- ", n, ":", len(d), "item(s)")
423464
elif appName == "JFIF" :
424465
processJFIFSegment(dict, info, data)
425466
elif appName == "ICC_PROFILE" :

0 commit comments

Comments
 (0)