forked from Pissandshittium/pissandshittium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdifference_estimator.cc
130 lines (101 loc) · 3.32 KB
/
difference_estimator.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Copyright 2009 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// We want to measure the similarity of two sequences of bytes as a surrogate
// for measuring how well a second sequence will compress differentially to the
// first sequence.
#include "courgette/difference_estimator.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <unordered_set>
namespace courgette {
// Our difference measure is the number of k-tuples that occur in Subject that
// don't occur in Base.
const int kTupleSize = 4;
namespace {
static_assert(kTupleSize >= 4 && kTupleSize <= 8,
"kTupleSize should be between 4 and 8");
size_t HashTuple(const uint8_t* source) {
size_t hash1 = *reinterpret_cast<const uint32_t*>(source);
size_t hash2 = *reinterpret_cast<const uint32_t*>(source + kTupleSize - 4);
size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
return hash;
}
bool RegionsEqual(const Region& a, const Region& b) {
if (a.length() != b.length())
return false;
return memcmp(a.start(), b.start(), a.length()) == 0;
}
} // anonymous namepace
class DifferenceEstimator::Base {
public:
explicit Base(const Region& region) : region_(region) { }
Base(const Base&) = delete;
Base& operator=(const Base&) = delete;
void Init() {
if (region_.length() < kTupleSize)
return;
const uint8_t* start = region_.start();
const uint8_t* end = region_.end() - (kTupleSize - 1);
for (const uint8_t* p = start; p < end; ++p) {
size_t hash = HashTuple(p);
hashes_.insert(hash);
}
}
const Region& region() const { return region_; }
private:
Region region_;
std::unordered_set<size_t> hashes_;
friend class DifferenceEstimator;
};
class DifferenceEstimator::Subject {
public:
explicit Subject(const Region& region) : region_(region) {}
Subject(const Subject&) = delete;
Subject& operator=(const Subject&) = delete;
const Region& region() const { return region_; }
private:
Region region_;
};
DifferenceEstimator::DifferenceEstimator() = default;
DifferenceEstimator::~DifferenceEstimator() {
for (size_t i = 0; i < owned_bases_.size(); ++i)
delete owned_bases_[i];
for (size_t i = 0; i < owned_subjects_.size(); ++i)
delete owned_subjects_[i];
}
DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
Base* base = new Base(region);
base->Init();
owned_bases_.push_back(base);
return base;
}
DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
const Region& region) {
Subject* subject = new Subject(region);
owned_subjects_.push_back(subject);
return subject;
}
size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
size_t mismatches = 0;
if (subject->region().length() >= kTupleSize) {
const uint8_t* start = subject->region().start();
const uint8_t* end = subject->region().end() - (kTupleSize - 1);
const uint8_t* p = start;
while (p < end) {
size_t hash = HashTuple(p);
if (base->hashes_.find(hash) == base->hashes_.end()) {
++mismatches;
}
p += 1;
}
}
if (mismatches == 0) {
if (RegionsEqual(base->region(), subject->region()))
return 0;
}
++mismatches; // Guarantee not zero.
return mismatches;
}
} // namespace