Skip to content

Commit 1755be3

Browse files
committed
htmlDown: clean element by CSS-like element selector #log
1 parent 8e11ca4 commit 1755be3

File tree

2 files changed

+98
-10
lines changed

2 files changed

+98
-10
lines changed

src/main/java/net/cofcool/sourcebox/internal/HtmlDownloader.java

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,12 @@ public class HtmlDownloader implements Tool {
5151
private static final Map<String, Function<Element, String>> tagMap = new HashMap<>();
5252

5353
private static final String IMGS_FOLDER = "imgs";
54+
private static Replacer REPLACER;
5455
private final Set<String> history = new HashSet<>();
5556

5657
private int depth;
5758
private boolean clean;
59+
private String cleanexp;
5860
private Set<OutputType> outputTypes = EnumSet.of(OutputType.html);
5961
private Proxy proxy;
6062
private String filter;
@@ -87,6 +89,8 @@ public void run(Args args) throws Exception {
8789
.collect(Collectors.toSet());
8890
var img = args.readArg("img").val();
8991
filter = args.readArg("filter").getVal().orElse(null);
92+
REPLACER = new Replacer(args.readArg("replace").getVal().orElse(null));
93+
cleanexp = args.readArg("cleanexp").getVal().orElse(null);
9094

9195
if (urls.isEmpty()) {
9296
throw new IllegalArgumentException("Do not find any url");
@@ -122,14 +126,18 @@ private Connection getConnection() {
122126
private static void toMarkdown(Document body, String folder, String title) throws IOException {
123127
var md = new ArrayList<String>();
124128
convertTagToMd(body.body(), md);
125-
FileUtils.writeLines(Paths.get(folder, title + ".md").toFile(), md);
129+
FileUtils.writeStringToFile(
130+
Paths.get(folder, title + ".md").toFile(),
131+
REPLACER.replace(String.join("\n", md)),
132+
StandardCharsets.UTF_8
133+
);
126134
}
127135

128136
private static void toPlainText(Document body, String folder, String title) throws IOException {
129137
var cleaner = new Cleaner(Safelist.none());
130138
FileUtils.writeStringToFile(
131139
Paths.get(folder, title + ".txt").toFile(),
132-
cleaner.clean(body).wholeText(),
140+
REPLACER.replace(cleaner.clean(body).wholeText()),
133141
StandardCharsets.UTF_8
134142
);
135143
}
@@ -177,17 +185,21 @@ private void downloadUrl(String folder, String url, int depth, String expression
177185
context.write(String.format("Create dir %s", dir));
178186
}
179187

180-
if (clean) {
181-
doc.getElementsByTag("script").remove();
182-
doc.getElementsByTag("style").remove();
183-
doc.getElementsByTag("meta").remove();
184-
}
185-
186188
downloadImages(doc.getElementsByTag("img"), folder, expression);
187189

188190
if (filter == null || title.contains(filter)) {
191+
var out = doc.clone();
192+
if (clean) {
193+
out.getElementsByTag("script").remove();
194+
out.getElementsByTag("style").remove();
195+
out.getElementsByTag("link").remove();
196+
out.getElementsByTag("meta").remove();
197+
}
198+
if (StringUtils.isNotBlank(cleanexp)) {
199+
out.select(cleanexp).remove();
200+
}
189201
for (OutputType type : outputTypes) {
190-
type.applyOutput(doc, folder, title);
202+
type.applyOutput(out, folder, title);
191203
context.write(String.format("Save %s file to %s from <<%s>>: %s", type, folder, title, url));
192204
}
193205
}
@@ -264,9 +276,48 @@ public Args config() {
264276
.arg(new Arg("proxy", null, "request proxy", false, "127.0.0.1:8087"))
265277
.arg(new Arg("out", "./", "output folder", false, null))
266278
.arg(new Arg("clean", "false", "remove css or javascript", false, null))
279+
.arg(new Arg("cleanexp", null, "clean element by CSS-like element selector", false, "a[href]"))
280+
.arg(new Arg("replace", null, "replace some text", false, "test+"))
267281
.runnerTypes(EnumSet.allOf(RunnerType.class));
268282
}
269283

284+
/** Value holder pairing a {@code tag} string with a map of attribute strings. */
private record Expression(String tag, Map<String, String> attributes) { }
285+
286+
// tag:a,attr:ref=1&ref1=2;
287+
static List<Map<String, Expression>> parseExp(String args) {
288+
List<Map<String, Expression>> result = new ArrayList<>();
289+
for (String input : args.split(";")) {
290+
String[] keyValuePairs = input.split(",");
291+
var pars = new HashMap<String, Expression>();
292+
for (String pair : keyValuePairs) {
293+
String[] keyValue = pair.split(":");
294+
if (keyValue.length == 2) {
295+
String key = keyValue[0];
296+
pars.put(pair, new Expression(key, parseAttr(keyValue[1])));
297+
} else {
298+
throw new IllegalArgumentException("Invalid expression: " + input);
299+
}
300+
}
301+
result.add(pars);
302+
}
303+
304+
return result;
305+
}
306+
307+
private static Map<String, String> parseAttr(String value) {
308+
Map<String, String> subMap = new HashMap<>();
309+
String[] subPairs = value.split("&");
310+
for (String subPair : subPairs) {
311+
String[] subKeyValue = subPair.split("=");
312+
if (subKeyValue.length == 2) {
313+
subMap.put(subKeyValue[0], subKeyValue[1]);
314+
} else {
315+
subMap.put(subPair, subPair);
316+
}
317+
}
318+
return subMap;
319+
}
320+
270321
private static String checkText(Element e, String out) {
271322
return StringUtils.isEmpty(e.text()) ? null : out;
272323
}
@@ -277,7 +328,7 @@ enum OutputType {
277328
if (file.exists()) {
278329
return;
279330
}
280-
FileUtils.writeStringToFile(file, d.outerHtml(), StandardCharsets.UTF_8);
331+
FileUtils.writeStringToFile(file, REPLACER.replace(d.outerHtml()), StandardCharsets.UTF_8);
281332
}),
282333
txt(HtmlDownloader::toPlainText),
283334
markdown(HtmlDownloader::toMarkdown),
@@ -299,6 +350,17 @@ private void finished() {
299350

300351
}
301352

353+
private record Replacer(String regex) {
354+
355+
String replace(String text) {
356+
if (StringUtils.isEmpty(regex)) {
357+
return text;
358+
}
359+
return text.replaceAll(regex, "");
360+
}
361+
362+
}
363+
302364
private interface Output {
303365

304366
void write(Document body, String folder, String title) throws IOException;

src/test/java/net/cofcool/sourcebox/internal/HtmlDownloaderTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,32 @@ void runWithTxt() throws Exception {
8989
);
9090
}
9191

92+
@Test
93+
void runWithReplace() throws Exception {
94+
instance().run(args
95+
.arg("url", url)
96+
.arg("out", "./target/HtmlDownloaderTest")
97+
.arg("replace", "<img.+*>")
98+
.arg("outType", OutputType.html.name())
99+
);
100+
}
101+
102+
@Test
103+
void testParseExp() throws Exception {
104+
var a = HtmlDownloader.parseExp("tag:a,attr:ref=1&ref1=2;tag:b,attr:ref=1&ref1=2;");
105+
System.out.println(a);
106+
}
107+
108+
@Test
109+
void runWithCleanexp() throws Exception {
110+
instance().run(args
111+
.arg("url", url)
112+
.arg("out", "./target/HtmlDownloaderTest")
113+
.arg("cleanexp", "img")
114+
.arg("outType", OutputType.html.name())
115+
);
116+
}
117+
92118
@Test
93119
void runWithEpub() throws Exception {
94120
instance().run(args

0 commit comments

Comments
 (0)