Skip to content

Commit aeaa3d2

Browse files
committed
feat(math): introduce Chi Square entropy in EntropyReport
Introduce another entropy measure based on Chi Square probability by using unblob-native's chi_square_probability function. This function returns the Chi Square distribution probability.

Chi-square tests are effective for distinguishing compressed from encrypted data because they evaluate the uniformity of byte distributions more rigorously than Shannon entropy. In compressed files, bytes often cluster around certain values due to patterns that still exist (albeit less detectable), resulting in a non-uniform distribution. Encrypted data, by contrast, exhibits nearly perfect uniformity, as each byte value from 0–255 is expected to appear with almost equal frequency, making it harder to detect any discernible patterns.

The chi-square distribution is calculated for the stream of bytes in the chunk and expressed as an absolute number and a percentage, which indicates how frequently a truly random sequence would exceed the value calculated. The percentage is the only value that is of interest from unblob's perspective, so it is the only value we return.

According to the ent documentation [0]:

> We [can] interpret the percentage as the degree to which the
> sequence tested is suspected of being non-random. If the percentage is
> greater than 99% or less than 1%, the sequence is almost certainly not
> random. If the percentage is between 99% and 95% or between 1% and 5%,
> the sequence is suspect. Percentages between 90% and 95% and 5% and 10%
> indicate the sequence is “almost suspect”.

[0] - https://www.fourmilab.ch/random/

This entropy measure is introduced by modifying the EntropyReport class so that it contains two EntropyMeasurements:

- shannon: for Shannon entropy, which was already there
- chi_square: for Chi Square entropy, which we introduce

The format_entropy_plot function has been adjusted to display two lines within the entropy graph: one for Shannon, the other for Chi Square.
1 parent 5e1235b commit aeaa3d2

File tree

3 files changed

+93
-39
lines changed

3 files changed

+93
-39
lines changed

tests/test_processing.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import sys
33
import zipfile
44
from pathlib import Path
5+
from statistics import mean
56
from typing import Collection, List, Optional, Tuple, Type, TypeVar
67

78
import attr
@@ -33,6 +34,7 @@
3334
)
3435
from unblob.report import (
3536
ChunkReport,
37+
EntropyMeasurements,
3638
EntropyReport,
3739
ExtractDirectoryExistsReport,
3840
FileMagicReport,
@@ -199,7 +201,15 @@ def test_calculate_block_size(
199201

200202
def test_format_entropy_plot_error():
201203
with pytest.raises(TypeError):
202-
format_entropy_plot(percentages=[], block_size=1024)
204+
format_entropy_plot(
205+
EntropyReport(
206+
shannon=EntropyMeasurements(percentages=[], block_size=1024),
207+
chi_square=EntropyMeasurements(
208+
percentages=[],
209+
block_size=1024,
210+
),
211+
)
212+
)
203213

204214

205215
@pytest.mark.parametrize(
@@ -215,17 +225,20 @@ def test_format_entropy_plot_error():
215225
)
216226
def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
217227
assert str(block_size) in format_entropy_plot(
218-
percentages=percentages,
219-
block_size=block_size,
228+
EntropyReport(
229+
shannon=EntropyMeasurements(
230+
percentages=percentages, block_size=block_size, mean=mean(percentages)
231+
),
232+
chi_square=EntropyMeasurements(
233+
percentages=percentages, block_size=block_size, mean=mean(percentages)
234+
),
235+
)
220236
)
221237

222238

223239
def test_calculate_entropy_no_exception():
224240
report = calculate_entropy(Path(sys.executable))
225-
format_entropy_plot(
226-
percentages=report.percentages,
227-
block_size=report.block_size,
228-
)
241+
format_entropy_plot(report)
229242

230243

231244
@pytest.mark.parametrize(
@@ -434,17 +447,29 @@ def get_all(file_name, report_type: Type[ReportType]) -> List[ReportType]:
434447
assert (
435448
unknown_entropy is not None
436449
) # removes pyright complaints for the below lines :(
437-
assert unknown_entropy.percentages == [0.0, 75.0] + [100.0] * 62
438-
assert unknown_entropy.block_size == 1024
439-
assert round(unknown_entropy.mean, 2) == 98.05 # noqa: PLR2004
440-
assert unknown_entropy.highest == 100.0 # noqa: PLR2004
441-
assert unknown_entropy.lowest == 0.0
450+
assert unknown_entropy.shannon.percentages == [0.0, 75.0] + [100.0] * 62
451+
assert unknown_entropy.shannon.block_size == 1024
452+
assert round(unknown_entropy.shannon.mean, 2) == 98.05 # noqa: PLR2004
453+
assert unknown_entropy.shannon.highest == 100.0 # noqa: PLR2004
454+
assert unknown_entropy.shannon.lowest == 0.0
455+
assert unknown_entropy.chi_square.percentages == [0.0, 0.0] + [100.0] * 62
456+
assert unknown_entropy.chi_square.block_size == 1024
457+
assert round(unknown_entropy.shannon.mean, 2) == 98.05 # noqa: PLR2004
458+
assert unknown_entropy.chi_square.highest == 100.0 # noqa: PLR2004
459+
assert unknown_entropy.chi_square.lowest == 0.0
442460

443461
# we should have entropy calculated for files without extractions, except for empty files
444462
assert get_all("empty.txt", EntropyReport) == []
445-
assert [EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)] == get_all(
446-
"0-255.bin", EntropyReport
447-
)
463+
assert [
464+
EntropyReport(
465+
shannon=EntropyMeasurements(
466+
percentages=[100.0], block_size=1024, mean=100.0
467+
),
468+
chi_square=EntropyMeasurements(
469+
percentages=[100.0], block_size=1024, mean=100.0
470+
),
471+
)
472+
] == get_all("0-255.bin", EntropyReport)
448473

449474

450475
@pytest.mark.parametrize(

unblob/processing.py

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import shutil
33
from operator import attrgetter
44
from pathlib import Path
5+
from statistics import mean
56
from typing import Iterable, List, Optional, Sequence, Set, Tuple, Type, Union
67

78
import attr
@@ -34,6 +35,7 @@
3435
from .pool import make_pool
3536
from .report import (
3637
CalculateMultiFileExceptionReport,
38+
EntropyMeasurements,
3739
EntropyReport,
3840
ExtractDirectoryExistsReport,
3941
FileMagicReport,
@@ -563,8 +565,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
563565
logger.debug(
564566
"Entropy chart",
565567
# New line so that chart title will be aligned correctly in the next line
566-
chart="\n"
567-
+ format_entropy_plot(report.percentages, report.block_size),
568+
chart="\n" + format_entropy_plot(report),
568569
path=path,
569570
_verbosity=3,
570571
)
@@ -709,7 +710,8 @@ def calculate_entropy(path: Path) -> EntropyReport:
709710
can contain 0-8 bits of entropy. We normalize it for visualization to a
710711
0-100% scale, to make it easier to interpret the graph.
711712
"""
712-
percentages = []
713+
shannon_percentages = []
714+
chi_square_percentages = []
713715

714716
# We could use the chunk size instead of another syscall,
715717
# but we rely on the actual file size written to the disk
@@ -725,28 +727,46 @@ def calculate_entropy(path: Path) -> EntropyReport:
725727
max_limit=1024 * 1024,
726728
)
727729

728-
entropy_sum = 0.0
729730
with File.from_path(path) as file:
730731
for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
731-
entropy = mt.shannon_entropy(chunk)
732-
entropy_percentage = round(entropy / 8 * 100, 2)
733-
percentages.append(entropy_percentage)
734-
entropy_sum += entropy * len(chunk)
732+
shannon_entropy = mt.shannon_entropy(chunk)
733+
shannon_entropy_percentage = round(shannon_entropy / 8 * 100, 2)
734+
shannon_percentages.append(shannon_entropy_percentage)
735+
736+
chi_square_probability = mt.chi_square_probability(chunk)
737+
chisquare_entropy_percentage = round(chi_square_probability * 100, 2)
738+
chi_square_percentages.append(chisquare_entropy_percentage)
735739

736740
report = EntropyReport(
737-
percentages=percentages,
738-
block_size=block_size,
739-
mean=entropy_sum / file_size / 8 * 100,
741+
shannon=EntropyMeasurements(
742+
percentages=shannon_percentages,
743+
block_size=block_size,
744+
mean=mean(shannon_percentages),
745+
),
746+
chi_square=EntropyMeasurements(
747+
percentages=chi_square_percentages,
748+
block_size=block_size,
749+
mean=mean(chi_square_percentages),
750+
),
740751
)
741752

742753
logger.debug(
743-
"Entropy calculated",
754+
"Shannon entropy calculated",
744755
path=path,
745756
size=file_size,
746-
block_size=report.block_size,
747-
mean=round(report.mean, 2),
748-
highest=round(report.highest, 2),
749-
lowest=round(report.lowest, 2),
757+
block_size=report.shannon.block_size,
758+
mean=round(report.shannon.mean, 2),
759+
highest=round(report.shannon.highest, 2),
760+
lowest=round(report.shannon.lowest, 2),
761+
)
762+
logger.debug(
763+
"Chi square entropy calculated",
764+
path=path,
765+
size=file_size,
766+
block_size=report.chi_square.block_size,
767+
mean=round(report.chi_square.mean, 2),
768+
highest=round(report.chi_square.highest, 2),
769+
lowest=round(report.chi_square.lowest, 2),
750770
)
751771

752772
return report
@@ -763,22 +783,25 @@ def calculate_block_size(
763783
return block_size # noqa: RET504
764784

765785

766-
def format_entropy_plot(percentages: List[float], block_size: int):
786+
def format_entropy_plot(report: EntropyReport):
767787
# start from scratch
768788
plt.clear_figure()
769789
# go colorless
770790
plt.clear_color()
771791
plt.title("Entropy distribution")
772-
# plt.xlabel(humanize.naturalsize(block_size))
773-
plt.xlabel(f"{block_size} bytes")
774-
plt.ylabel("entropy %")
792+
plt.xlabel(f"{report.shannon.block_size} bytes")
775793

776-
plt.scatter(percentages, marker="dot")
794+
plt.plot(report.shannon.percentages, label="Shannon entropy (%)", marker="dot")
795+
plt.plot(
796+
report.chi_square.percentages,
797+
label="Chi square probability (%)",
798+
marker="cross",
799+
)
777800
# 16 height leaves no gaps between the lines
778801
plt.plot_size(100, 16)
779-
plt.ylim(0, 100)
802+
780803
# Draw ticks every 1Mb on the x axis.
781-
plt.xticks(range(len(percentages) + 1))
804+
plt.xticks(range(len(report.shannon.percentages) + 1))
782805
# Always show 0% and 100%
783806
plt.yticks(range(0, 101, 10))
784807

unblob/report.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ class FileMagicReport(Report):
191191

192192

193193
@attr.define(kw_only=True, frozen=True)
194-
class EntropyReport(Report):
194+
class EntropyMeasurements:
195195
percentages: List[float]
196196
block_size: int
197197
mean: float
@@ -205,6 +205,12 @@ def lowest(self):
205205
return min(self.percentages)
206206

207207

208+
@attr.define(kw_only=True, frozen=True)
209+
class EntropyReport(Report):
210+
shannon: EntropyMeasurements
211+
chi_square: EntropyMeasurements
212+
213+
208214
@final
209215
@attr.define(kw_only=True, frozen=True)
210216
class ChunkReport(Report):

0 commit comments

Comments
 (0)