Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support normalization of line endings #156

Merged
merged 3 commits into the base branch from the contributor's branch
Sep 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 33 additions & 5 deletions securesystemslib/hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
# Import securesystemslib logger to log warning messages.
logger = logging.getLogger('securesystemslib.hash')

DEFAULT_CHUNK_SIZE = 4096
DEFAULT_HASH_ALGORITHM = 'sha256'
DEFAULT_HASH_LIBRARY = 'hashlib'
SUPPORTED_LIBRARIES = ['hashlib']
Expand Down Expand Up @@ -125,7 +126,7 @@ def digest(algorithm=DEFAULT_HASH_ALGORITHM, hash_library=DEFAULT_HASH_LIBRARY):


def digest_fileobject(file_object, algorithm=DEFAULT_HASH_ALGORITHM,
hash_library=DEFAULT_HASH_LIBRARY):
hash_library=DEFAULT_HASH_LIBRARY, normalize_line_endings=False):
"""
<Purpose>
Generate a digest object given a file object. The new digest object
Expand All @@ -143,6 +144,14 @@ def digest_fileobject(file_object, algorithm=DEFAULT_HASH_ALGORITHM,
hash_library:
The library providing the hash algorithms (e.g., 'hashlib').

normalize_line_endings: (default False)
Whether or not to normalize line endings for cross-platform support.
Note that this results in ambiguous hashes (e.g. 'abc\n' and 'abc\r\n'
will produce the same hash), so be careful to only apply this to text
files (not binary), when that equivalence is desirable and cannot result
in easily-maliciously-corrupted files producing the same hash as a valid
file.

<Exceptions>
securesystemslib.exceptions.FormatError, if the arguments are
improperly formatted.
Expand Down Expand Up @@ -179,11 +188,26 @@ def digest_fileobject(file_object, algorithm=DEFAULT_HASH_ALGORITHM,
# Update the hash with the data read from each chunk and return after
# the entire file is processed.
while True:
chunksize = 4096
data = file_object.read(chunksize)
data = file_object.read(DEFAULT_CHUNK_SIZE)
if not data:
break

if normalize_line_endings:
while data[-1:] == b'\r':
c = file_object.read(1)
if not c:
break

data += c

data = (
data
# First Windows
.replace(b'\r\n', b'\n')
# Then Mac
.replace(b'\r', b'\n')
)

if not isinstance(data, six.binary_type):
digest_object.update(data.encode('utf-8'))

Expand All @@ -197,7 +221,7 @@ def digest_fileobject(file_object, algorithm=DEFAULT_HASH_ALGORITHM,


def digest_filename(filename, algorithm=DEFAULT_HASH_ALGORITHM,
hash_library=DEFAULT_HASH_LIBRARY):
hash_library=DEFAULT_HASH_LIBRARY, normalize_line_endings=False):
"""
<Purpose>
Generate a digest object, update its hash using a file object
Expand All @@ -213,6 +237,9 @@ def digest_filename(filename, algorithm=DEFAULT_HASH_ALGORITHM,
hash_library:
The library providing the hash algorithms (e.g., 'hashlib').

normalize_line_endings:
Whether or not to normalize line endings for cross-platform support.

<Exceptions>
securesystemslib.exceptions.FormatError, if the arguments are
improperly formatted.
Expand Down Expand Up @@ -243,6 +270,7 @@ def digest_filename(filename, algorithm=DEFAULT_HASH_ALGORITHM,
# digest_fileobject() raises:
# securesystemslib.exceptions.UnsupportedAlgorithmError
# securesystemslib.exceptions.UnsupportedLibraryError
digest_object = digest_fileobject(file_object, algorithm, hash_library)
digest_object = digest_fileobject(
file_object, algorithm, hash_library, normalize_line_endings)

return digest_object
22 changes: 22 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,28 @@ def _do_update_filename(self, library):
os.remove(filename)


def test_update_filename_normalize(self):
  # Exercise digest_filename() with normalize_line_endings=True under every
  # supported hash library (per SUPPORTED_LIBRARIES, currently 'hashlib').
  self._run_with_all_hash_libraries(self._do_update_filename_normalize)


def _do_update_filename_normalize(self, library):
  # Build input containing all three line-ending styles (CRLF, LF, bare CR),
  # repeated enough times to span multiple read chunks.
  raw = b'ab\r\nd\nf\r' * 4096
  # The ground truth: CRLF collapsed first, then any remaining lone CR.
  expected = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
  handle, path = tempfile.mkstemp()
  try:
    os.write(handle, raw)
    os.close(handle)
    for algorithm in ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'):
      # Digest of the pre-normalized bytes serves as the reference value.
      reference = securesystemslib.hash.digest(algorithm, library)
      reference.update(expected)
      # digest_filename() with normalization enabled must match it exactly.
      computed = securesystemslib.hash.digest_filename(
          path, algorithm, library, normalize_line_endings=True)
      self.assertEqual(reference.digest(), computed.digest())

  finally:
    os.remove(path)


def test_update_file_obj(self):
self._run_with_all_hash_libraries(self._do_update_file_obj)

Expand Down