22import shutil
33from operator import attrgetter
44from pathlib import Path
5+ from statistics import mean
56from typing import Iterable , List , Optional , Sequence , Set , Tuple , Type , Union
67
78import attr
3435from .pool import make_pool
3536from .report import (
3637 CalculateMultiFileExceptionReport ,
38+ EntropyMeasurements ,
3739 EntropyReport ,
3840 ExtractDirectoryExistsReport ,
3941 FileMagicReport ,
@@ -563,8 +565,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
563565 logger .debug (
564566 "Entropy chart" ,
565567 # New line so that chart title will be aligned correctly in the next line
566- chart = "\n "
567- + format_entropy_plot (report .percentages , report .block_size ),
568+ chart = "\n " + format_entropy_plot (report ),
568569 path = path ,
569570 _verbosity = 3 ,
570571 )
@@ -709,7 +710,8 @@ def calculate_entropy(path: Path) -> EntropyReport:
709710 can contain 0-8 bits of entropy. We normalize it for visualization to a
710711 0-100% scale, to make it easier to interpret the graph.
711712 """
712- percentages = []
713+ shannon_percentages = []
714+ chi_square_percentages = []
713715
714716 # We could use the chunk size instead of another syscall,
715717 # but we rely on the actual file size written to the disk
@@ -725,31 +727,52 @@ def calculate_entropy(path: Path) -> EntropyReport:
725727 max_limit = 1024 * 1024 ,
726728 )
727729
728- entropy_sum = 0.0
729730 with File .from_path (path ) as file :
730731 for chunk in iterate_file (file , 0 , file_size , buffer_size = block_size ):
731- entropy = mt .shannon_entropy (chunk )
732- entropy_percentage = round (entropy / 8 * 100 , 2 )
733- percentages .append (entropy_percentage )
734- entropy_sum += entropy * len (chunk )
735-
736- report = EntropyReport (
737- percentages = percentages ,
738- block_size = block_size ,
739- mean = entropy_sum / file_size / 8 * 100 ,
732+ shannon_entropy = mt .shannon_entropy (chunk )
733+ shannon_entropy_percentage = round (shannon_entropy / 8 * 100 , 2 )
734+ shannon_percentages .append (shannon_entropy_percentage )
735+
736+ chi_square_entropy = mt .chi_square (chunk )
737+ chi_square_max = 256 * (len (chunk ) - 1 )
738+ chisquare_entropy_percentage = round (
739+ (chi_square_entropy / chi_square_max ) * 100 , 2
740+ )
741+ chi_square_percentages .append (chisquare_entropy_percentage )
742+
743+ entropy_report = EntropyReport (
744+ shannon = EntropyMeasurements (
745+ percentages = shannon_percentages ,
746+ block_size = block_size ,
747+ mean = mean (shannon_percentages ),
748+ ),
749+ chi_square = EntropyMeasurements (
750+ percentages = chi_square_percentages ,
751+ block_size = block_size ,
752+ mean = mean (chi_square_percentages ),
753+ ),
740754 )
741755
742756 logger .debug (
743- "Entropy calculated" ,
757+ "Shannon entropy calculated" ,
758+ path = path ,
759+ size = file_size ,
760+ block_size = entropy_report .shannon .block_size ,
761+ mean = round (entropy_report .shannon .mean , 2 ),
762+ highest = round (entropy_report .shannon .highest , 2 ),
763+ lowest = round (entropy_report .shannon .lowest , 2 ),
764+ )
765+ logger .debug (
766+ "Chi square entropy calculated" ,
744767 path = path ,
745768 size = file_size ,
746- block_size = report .block_size ,
747- mean = round (report .mean , 2 ),
748- highest = round (report .highest , 2 ),
749- lowest = round (report .lowest , 2 ),
769+ block_size = entropy_report . chi_square .block_size ,
770+ mean = round (entropy_report . chi_square .mean , 2 ),
771+ highest = round (entropy_report . chi_square .highest , 2 ),
772+ lowest = round (entropy_report . chi_square .lowest , 2 ),
750773 )
751774
752- return report
775+ return entropy_report
753776
754777
755778def calculate_block_size (
@@ -763,23 +786,39 @@ def calculate_block_size(
763786 return block_size # noqa: RET504
764787
765788
def _build_entropy_plot(
    measurements: EntropyMeasurements, title: str, label: str
) -> str:
    """Render one entropy measurement series as a colorless text chart.

    :param measurements: series to draw; its ``percentages``, ``block_size``,
        ``lowest`` and ``highest`` attributes are read.
    :param title: chart title.
    :param label: label attached to the plotted series.
    :return: the chart as built by ``plt.build()``.
    """
    # start from scratch
    plt.clear_figure()
    # go colorless
    plt.clear_color()
    plt.title(title)
    plt.xlabel(f"{measurements.block_size} bytes")

    plt.plot(measurements.percentages, label=label)
    # 16 height leaves no gaps between the lines
    plt.plot_size(100, 16)

    # One tick per integer position, from 0 up to the number of measurements.
    plt.xticks(range(len(measurements.percentages) + 1))
    # zoom into the area where our data lives
    # NOTE(review): lowest/highest are presumably the min/max of percentages;
    # confirm against EntropyMeasurements.
    plt.ylim(measurements.lowest, measurements.highest + 1)

    return plt.build()


def format_entropy_plot(report: EntropyReport) -> str:
    """Format the Shannon and chi-square series of *report* as text charts.

    Each series is drawn on its own freshly cleared figure; the two charts
    are returned in one string, Shannon first, each preceded by a newline.

    :param report: entropy report whose ``shannon`` and ``chi_square``
        measurements are plotted.
    :return: both charts concatenated into a single string.
    """
    shannon_plot = _build_entropy_plot(
        report.shannon,
        title="Shannon Entropy distribution",
        label="Shannon (%)",
    )
    chi_square_plot = _build_entropy_plot(
        report.chi_square,
        title="Chi Square Entropy distribution",
        label="Chi square (%)",
    )
    return "\n" + shannon_plot + "\n" + chi_square_plot
0 commit comments