forked from sstadick/cargo-bundle-licenses
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscovery.rs
180 lines (158 loc) · 5.33 KB
/
discovery.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
use std::{collections::HashMap, fs, path::PathBuf};
use cargo_metadata::Package;
use regex::Regex;
use slug::slugify;
use thiserror::Error;
use crate::license::License;
// Thresholds applied to the mismatch score (word-count errors divided by the total
// number of words in the license template); a lower score means a closer match.

/// Scores strictly below this value are reported as `Confidence::Confident`.
const HIGH_CONFIDENCE_LIMIT: f32 = 0.10;
/// Scores strictly below this value (but not below `HIGH_CONFIDENCE_LIMIT`) are reported
/// as `Confidence::SemiConfident`; anything else is `Confidence::Unsure`.
const LOW_CONFIDENCE_LIMIT: f32 = 0.15;
/// Errors that can occur while searching a package directory for license files.
#[derive(Debug, Error)]
pub enum DiscoveryError {
/// An underlying I/O failure (for example, listing the package directory failed).
/// Displayed transparently, i.e. using the wrapped error's own message.
#[error(transparent)]
Io(#[from] std::io::Error),
}
/// How much trust to place in a discovered license text.
///
/// Within this file only `Confident`, `SemiConfident`, `Unsure` and `NoTemplate` are
/// produced (by `check_against_template`); the remaining variants are presumably assigned
/// by callers elsewhere in the crate — verify against their use sites.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Confidence {
/// NOTE(review): not produced in this file; by name, more than one candidate file was found.
MultiplePossibleLicenseFiles,
/// NOTE(review): not produced in this file; by name, no license file was found.
MissingLicenseFile,
/// The text's mismatch score against the template is below `HIGH_CONFIDENCE_LIMIT`.
Confident,
/// The mismatch score is below `LOW_CONFIDENCE_LIMIT` but not below `HIGH_CONFIDENCE_LIMIT`.
SemiConfident,
/// The mismatch score is at or above `LOW_CONFIDENCE_LIMIT` (or is not a number).
Unsure,
/// The license (or one member of a multi-license) has no template to compare against.
NoTemplate,
/// NOTE(review): not produced in this file; by name, the package declares no license.
UnspecifiedLicenseInPackage,
}
/// A license file found for a package, together with how well it matches the expected license.
#[derive(Debug)]
pub struct LicenseText {
/// Path of the file the text was read from.
pub path: PathBuf,
/// Full contents of the file.
pub text: String,
/// Result of comparing `text` against the template of the expected license.
pub confidence: Confidence,
}
/// Adds the word counts of `text` to `freq`.
///
/// A word is any match of the regex `\w+`. Each word is lower-cased before counting, so
/// the resulting table is case-insensitive. Counts already present in `freq` are
/// incremented rather than replaced, which lets callers accumulate several texts into
/// one table.
fn add_frequencies(freq: &mut HashMap<String, u32>, text: &str) {
    // The pattern is a fixed literal, so compilation can only fail on a programming error.
    let word_pattern = Regex::new(r"\w+").expect("`\\w+` is a valid regular expression");
    for word in word_pattern.find_iter(text) {
        // `to_lowercase` already returns an owned `String`; cloning it again would only
        // add a second allocation per word.
        *freq.entry(word.as_str().to_lowercase()).or_insert(0) += 1;
    }
}
/// Builds a fresh word-frequency table for `text`.
///
/// Equivalent to calling `add_frequencies` on an empty map.
fn calculate_frequency(text: &str) -> HashMap<String, u32> {
    let mut counts: HashMap<String, u32> = HashMap::new();
    add_frequencies(&mut counts, text);
    counts
}
/// Counts how many word occurrences differ between a text and a template.
///
/// For every word in `template_freq` the absolute difference between its count in the
/// text and in the template is added to the result. Words that occur only in the text
/// contribute their full count. A result of `0` means both tables are identical.
///
/// `text_freq` is taken by value because matched words are removed from it as they are
/// processed, leaving only the text-only words for the second pass.
///
/// The total saturates at `u32::MAX` instead of overflowing.
fn compare(mut text_freq: HashMap<String, u32>, template_freq: &HashMap<String, u32>) -> u32 {
    let mut errors: u32 = 0;
    for (word, &count) in template_freq {
        let text_count = text_freq.remove(word).unwrap_or(0);
        // Unsigned difference: casting through `i32` would wrap for counts above
        // `i32::MAX` and yield a wrong (or panicking) result.
        let diff = text_count.max(count) - text_count.min(count);
        errors = errors.saturating_add(diff);
    }
    // Whatever is left was never mentioned by the template, so every occurrence is an error.
    text_freq
        .values()
        .fold(errors, |total, &count| total.saturating_add(count))
}
/// Rates how closely `text` matches the template text of `license`.
///
/// The word frequencies of `text` are compared with those of the license template. For
/// `License::Multiple` the templates of all member licenses are accumulated into a single
/// frequency table first. The mismatch score is the number of differing word occurrences
/// divided by the total number of words in the template.
///
/// Returns `Confidence::NoTemplate` when the license (or any member of a multi-license)
/// has no template; otherwise `Confident`, `SemiConfident` or `Unsure` depending on where
/// the score falls relative to `HIGH_CONFIDENCE_LIMIT` and `LOW_CONFIDENCE_LIMIT`.
fn check_against_template(text: &str, license: &License) -> Confidence {
    let text_freq = calculate_frequency(text);

    let template_freq = match *license {
        License::Multiple(ref licenses) => {
            let mut combined = HashMap::new();
            for member in licenses {
                match member.template() {
                    Some(template) => add_frequencies(&mut combined, template),
                    // One missing template makes the whole comparison meaningless.
                    None => return Confidence::NoTemplate,
                }
            }
            combined
        }
        _ => match license.template() {
            Some(template) => calculate_frequency(template),
            None => return Confidence::NoTemplate,
        },
    };

    let total: u32 = template_freq.values().sum();
    let errors = compare(text_freq, &template_freq);
    let score = (errors as f32) / (total as f32);

    // A NaN score (empty template and empty text) fails both guards and ends up `Unsure`.
    match score {
        s if s < HIGH_CONFIDENCE_LIMIT => Confidence::Confident,
        s if s < LOW_CONFIDENCE_LIMIT => Confidence::SemiConfident,
        _ => Confidence::Unsure,
    }
}
/// Searches the directory containing `package`'s manifest for license files.
///
/// Only direct entries of that directory are inspected. A file whose slugified name
/// matches `license` (see the `name_matches` helper) is read and scored against the license
/// template; every such file is returned. If no specifically named file is found, a file
/// with a generic name such as `LICENSE` or `COPYING` is returned instead, when present.
/// If several generic files exist, the one yielded last by the directory listing wins.
///
/// Entries that cannot be read as UTF-8 text (including directories) are skipped silently.
///
/// # Errors
///
/// Returns `DiscoveryError::Io` if the directory cannot be listed or one of its entries
/// cannot be retrieved.
///
/// # Panics
///
/// Panics if the package's manifest path has no parent directory.
pub fn find_package_license(
    package: &Package,
    license: &License,
) -> Result<Vec<LicenseText>, DiscoveryError> {
    /// Is this a generic license name
    fn generic_license_name(name: &str) -> bool {
        // Upper-case once instead of once per comparison.
        matches!(
            name.to_uppercase().as_str(),
            "LICENSE" | "LICENCE" | "LICENSE.MD" | "LICENSE.TXT" | "COPYING"
        )
    }

    /// Does the slugified file name `slug` follow one of the accepted naming schemes for
    /// the license identifier `key` (`key`, `license-key[-md|-txt]`, `key-license[-md|-txt]`)?
    fn slug_matches_key<K: AsRef<str>>(slug: &str, key: K) -> bool {
        let key = key.as_ref();
        slug == key
            || slug == format!("license-{}", key)
            || slug == format!("license-{}-md", key)
            || slug == format!("license-{}-txt", key)
            || slug == format!("{}-license", key)
            || slug == format!("{}-license-md", key)
            || slug == format!("{}-license-txt", key)
    }

    /// Does the file name `name` identify a license file for `license`?
    fn name_matches(name: &str, license: &License) -> bool {
        // `slugify` only emits `a-z`, `0-9` and `-`, so no further lower-casing is needed.
        let slug = slugify(name);
        match *license {
            License::Custom(ref custom) => slug_matches_key(&slug, slugify(custom)),
            ref license => license
                .synonyms()
                .into_iter()
                .any(|lic| slug_matches_key(&slug, lic)),
        }
    }

    let package_dir = package
        .manifest_path
        .parent()
        .expect("a package manifest path always has a parent directory");

    // Fallback used only when no specifically named license file turns up.
    let mut generic = None;
    let mut texts = vec![];
    for entry in fs::read_dir(package_dir)? {
        let entry = entry?;
        let path = entry.path();
        let name = entry.file_name().to_string_lossy().into_owned();

        let is_specific = name_matches(&name, license);
        if !is_specific && !generic_license_name(&name) {
            continue;
        }

        // Best effort: unreadable or non-text candidates are ignored on purpose.
        if let Ok(text) = fs::read_to_string(&path) {
            let confidence = check_against_template(&text, license);
            let found = LicenseText {
                path,
                text,
                confidence,
            };
            if is_specific {
                texts.push(found);
            } else {
                generic = Some(found);
            }
        }
    }

    if texts.is_empty() {
        texts.extend(generic);
    }
    Ok(texts)
}