Skip to content

Commit 65c20e2

Browse files
committed
feat(math): introduce Chi Square entropy in EntropyReport
Introduce another entropy measure based on Chi Square. This is done by modifying the EntropyReport class so that it contains two EntropyMeasurements:
- shannon: for Shannon entropy, which was already there
- chi_square: for Chi Square entropy, which this commit introduces

The format_entropy_plot function has been adjusted to display two graphs: one for Shannon, the other for Chi Square.
1 parent 73b16e1 commit 65c20e2

File tree

3 files changed

+111
-46
lines changed

3 files changed

+111
-46
lines changed

tests/test_processing.py

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import sys
33
import zipfile
44
from pathlib import Path
5+
from statistics import mean
56
from typing import Collection, List, Optional, Tuple, Type, TypeVar
67

78
import attr
@@ -33,6 +34,7 @@
3334
)
3435
from unblob.report import (
3536
ChunkReport,
37+
EntropyMeasurements,
3638
EntropyReport,
3739
ExtractDirectoryExistsReport,
3840
FileMagicReport,
@@ -199,7 +201,15 @@ def test_calculate_block_size(
199201

200202
def test_format_entropy_plot_error():
201203
with pytest.raises(TypeError):
202-
format_entropy_plot(percentages=[], block_size=1024)
204+
format_entropy_plot(
205+
EntropyReport(
206+
shannon=EntropyMeasurements(percentages=[], block_size=1024),
207+
chi_square=EntropyMeasurements(
208+
percentages=[],
209+
block_size=1024,
210+
),
211+
)
212+
)
203213

204214

205215
@pytest.mark.parametrize(
@@ -215,17 +225,20 @@ def test_format_entropy_plot_error():
215225
)
216226
def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
217227
assert str(block_size) in format_entropy_plot(
218-
percentages=percentages,
219-
block_size=block_size,
228+
EntropyReport(
229+
shannon=EntropyMeasurements(
230+
percentages=percentages, block_size=block_size, mean=mean(percentages)
231+
),
232+
chi_square=EntropyMeasurements(
233+
percentages=percentages, block_size=block_size, mean=mean(percentages)
234+
),
235+
)
220236
)
221237

222238

223239
def test_calculate_entropy_no_exception():
224240
report = calculate_entropy(Path(sys.executable))
225-
format_entropy_plot(
226-
percentages=report.percentages,
227-
block_size=report.block_size,
228-
)
241+
format_entropy_plot(report)
229242

230243

231244
@pytest.mark.parametrize(
@@ -434,17 +447,24 @@ def get_all(file_name, report_type: Type[ReportType]) -> List[ReportType]:
434447
assert (
435448
unknown_entropy is not None
436449
) # removes pyright complaints for the below lines :(
437-
assert unknown_entropy.percentages == [0.0, 75.0] + [100.0] * 62
438-
assert unknown_entropy.block_size == 1024
439-
assert round(unknown_entropy.mean, 2) == 98.05 # noqa: PLR2004
440-
assert unknown_entropy.highest == 100.0 # noqa: PLR2004
441-
assert unknown_entropy.lowest == 0.0
450+
assert unknown_entropy.shannon.percentages == [0.0, 75.0] + [100.0] * 62
451+
assert unknown_entropy.shannon.block_size == 1024
452+
assert round(unknown_entropy.shannon.mean, 2) == 98.05 # noqa: PLR2004
453+
assert unknown_entropy.shannon.highest == 100.0 # noqa: PLR2004
454+
assert unknown_entropy.shannon.lowest == 0.0
442455

443456
# we should have entropy calculated for files without extractions, except for empty files
444457
assert get_all("empty.txt", EntropyReport) == []
445-
assert [EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)] == get_all(
446-
"0-255.bin", EntropyReport
447-
)
458+
assert [
459+
EntropyReport(
460+
shannon=EntropyMeasurements(
461+
percentages=[100.0], block_size=1024, mean=100.0
462+
),
463+
chi_square=EntropyMeasurements(
464+
percentages=[0.0], block_size=1024, mean=0.0
465+
),
466+
)
467+
] == get_all("0-255.bin", EntropyReport)
448468

449469

450470
@pytest.mark.parametrize(

unblob/processing.py

Lines changed: 69 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import shutil
33
from operator import attrgetter
44
from pathlib import Path
5+
from statistics import mean
56
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union
67

78
import attr
@@ -34,6 +35,7 @@
3435
from .pool import make_pool
3536
from .report import (
3637
CalculateMultiFileExceptionReport,
38+
EntropyMeasurements,
3739
EntropyReport,
3840
ExtractDirectoryExistsReport,
3941
FileMagicReport,
@@ -563,8 +565,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
563565
logger.debug(
564566
"Entropy chart",
565567
# New line so that chart title will be aligned correctly in the next line
566-
chart="\n"
567-
+ format_entropy_plot(report.percentages, report.block_size),
568+
chart="\n" + format_entropy_plot(report),
568569
path=path,
569570
_verbosity=3,
570571
)
@@ -709,7 +710,8 @@ def calculate_entropy(path: Path) -> EntropyReport:
709710
can contain 0-8 bits of entropy. We normalize it for visualization to a
710711
0-100% scale, to make it easier to interpret the graph.
711712
"""
712-
percentages = []
713+
shannon_percentages = []
714+
chi_square_percentages = []
713715

714716
# We could use the chunk size instead of another syscall,
715717
# but we rely on the actual file size written to the disk
@@ -725,31 +727,52 @@ def calculate_entropy(path: Path) -> EntropyReport:
725727
max_limit=1024 * 1024,
726728
)
727729

728-
entropy_sum = 0.0
729730
with File.from_path(path) as file:
730731
for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
731-
entropy = mt.shannon_entropy(chunk)
732-
entropy_percentage = round(entropy / 8 * 100, 2)
733-
percentages.append(entropy_percentage)
734-
entropy_sum += entropy * len(chunk)
735-
736-
report = EntropyReport(
737-
percentages=percentages,
738-
block_size=block_size,
739-
mean=entropy_sum / file_size / 8 * 100,
732+
shannon_entropy = mt.shannon_entropy(chunk)
733+
shannon_entropy_percentage = round(shannon_entropy / 8 * 100, 2)
734+
shannon_percentages.append(shannon_entropy_percentage)
735+
736+
chi_square_entropy = mt.chi_square(chunk)
737+
chi_square_max = 256 * (len(chunk) - 1)
738+
chisquare_entropy_percentage = round(
739+
(chi_square_entropy / chi_square_max) * 100, 2
740+
)
741+
chi_square_percentages.append(chisquare_entropy_percentage)
742+
743+
entropy_report = EntropyReport(
744+
shannon=EntropyMeasurements(
745+
percentages=shannon_percentages,
746+
block_size=block_size,
747+
mean=mean(shannon_percentages),
748+
),
749+
chi_square=EntropyMeasurements(
750+
percentages=chi_square_percentages,
751+
block_size=block_size,
752+
mean=mean(chi_square_percentages),
753+
),
740754
)
741755

742756
logger.debug(
743-
"Entropy calculated",
757+
"Shannon entropy calculated",
758+
path=path,
759+
size=file_size,
760+
block_size=entropy_report.shannon.block_size,
761+
mean=round(entropy_report.shannon.mean, 2),
762+
highest=round(entropy_report.shannon.highest, 2),
763+
lowest=round(entropy_report.shannon.lowest, 2),
764+
)
765+
logger.debug(
766+
"Chi square entropy calculated",
744767
path=path,
745768
size=file_size,
746-
block_size=report.block_size,
747-
mean=round(report.mean, 2),
748-
highest=round(report.highest, 2),
749-
lowest=round(report.lowest, 2),
769+
block_size=entropy_report.chi_square.block_size,
770+
mean=round(entropy_report.chi_square.mean, 2),
771+
highest=round(entropy_report.chi_square.highest, 2),
772+
lowest=round(entropy_report.chi_square.lowest, 2),
750773
)
751774

752-
return report
775+
return entropy_report
753776

754777

755778
def calculate_block_size(
@@ -763,23 +786,39 @@ def calculate_block_size(
763786
return block_size # noqa: RET504
764787

765788

766-
def format_entropy_plot(percentages: List[float], block_size: int):
789+
def format_entropy_plot(report: EntropyReport):
790+
# start from scratch
791+
plt.clear_figure()
792+
# go colorless
793+
plt.clear_color()
794+
plt.title("Shannon Entropy distribution")
795+
plt.xlabel(f"{report.shannon.block_size} bytes")
796+
797+
plt.plot(report.shannon.percentages, label="Shannon (%)")
798+
# 16 height leaves no gaps between the lines
799+
plt.plot_size(100, 16)
800+
801+
# Draw ticks every 1Mb on the x axis.
802+
plt.xticks(range(len(report.shannon.percentages) + 1))
803+
# zoom into the area where our data lives
804+
plt.ylim(report.shannon.lowest, report.shannon.highest + 1)
805+
806+
shannon_plot = plt.build()
807+
767808
# start from scratch
768809
plt.clear_figure()
769810
# go colorless
770811
plt.clear_color()
771-
plt.title("Entropy distribution")
772-
# plt.xlabel(humanize.naturalsize(block_size))
773-
plt.xlabel(f"{block_size} bytes")
774-
plt.ylabel("entropy %")
812+
plt.title("Chi Square Entropy distribution")
813+
plt.xlabel(f"{report.chi_square.block_size} bytes")
775814

776-
plt.scatter(percentages, marker="dot")
815+
plt.plot(report.chi_square.percentages, label="Chi square (%)")
777816
# 16 height leaves no gaps between the lines
778817
plt.plot_size(100, 16)
779-
plt.ylim(0, 100)
780818
# Draw ticks every 1Mb on the x axis.
781-
plt.xticks(range(len(percentages) + 1))
782-
# Always show 0% and 100%
783-
plt.yticks(range(0, 101, 10))
819+
plt.xticks(range(len(report.chi_square.percentages) + 1))
820+
# zoom into the area where our data lives
821+
plt.ylim(report.chi_square.lowest, report.chi_square.highest + 1)
784822

785-
return plt.build()
823+
chi_square_plot = plt.build()
824+
return "\n" + shannon_plot + "\n" + chi_square_plot

unblob/report.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ class FileMagicReport(Report):
191191

192192

193193
@attr.define(kw_only=True, frozen=True)
194-
class EntropyReport(Report):
194+
class EntropyMeasurements:
195195
percentages: List[float]
196196
block_size: int
197197
mean: float
@@ -205,6 +205,12 @@ def lowest(self):
205205
return min(self.percentages)
206206

207207

208+
@attr.define(kw_only=True, frozen=True)
209+
class EntropyReport(Report):
210+
shannon: EntropyMeasurements
211+
chi_square: EntropyMeasurements
212+
213+
208214
@final
209215
@attr.define(kw_only=True, frozen=True)
210216
class ChunkReport(Report):

0 commit comments

Comments
 (0)