@@ -51,10 +51,12 @@ public class HtmlDownloader implements Tool {
51
51
private static final Map <String , Function <Element , String >> tagMap = new HashMap <>();
52
52
53
53
private static final String IMGS_FOLDER = "imgs" ;
54
+ private static Replacer REPLACER ;
54
55
private final Set <String > history = new HashSet <>();
55
56
56
57
private int depth ;
57
58
private boolean clean ;
59
+ private String cleanexp ;
58
60
private Set <OutputType > outputTypes = EnumSet .of (OutputType .html );
59
61
private Proxy proxy ;
60
62
private String filter ;
@@ -87,6 +89,8 @@ public void run(Args args) throws Exception {
87
89
.collect (Collectors .toSet ());
88
90
var img = args .readArg ("img" ).val ();
89
91
filter = args .readArg ("filter" ).getVal ().orElse (null );
92
+ REPLACER = new Replacer (args .readArg ("replace" ).getVal ().orElse (null ));
93
+ cleanexp = args .readArg ("cleanexp" ).getVal ().orElse (null );
90
94
91
95
if (urls .isEmpty ()) {
92
96
throw new IllegalArgumentException ("Do not find any url" );
@@ -122,14 +126,18 @@ private Connection getConnection() {
122
126
private static void toMarkdown (Document body , String folder , String title ) throws IOException {
123
127
var md = new ArrayList <String >();
124
128
convertTagToMd (body .body (), md );
125
- FileUtils .writeLines (Paths .get (folder , title + ".md" ).toFile (), md );
129
+ FileUtils .writeStringToFile (
130
+ Paths .get (folder , title + ".md" ).toFile (),
131
+ REPLACER .replace (String .join ("\n " , md )),
132
+ StandardCharsets .UTF_8
133
+ );
126
134
}
127
135
128
136
private static void toPlainText (Document body , String folder , String title ) throws IOException {
129
137
var cleaner = new Cleaner (Safelist .none ());
130
138
FileUtils .writeStringToFile (
131
139
Paths .get (folder , title + ".txt" ).toFile (),
132
- cleaner .clean (body ).wholeText (),
140
+ REPLACER . replace ( cleaner .clean (body ).wholeText () ),
133
141
StandardCharsets .UTF_8
134
142
);
135
143
}
@@ -177,17 +185,21 @@ private void downloadUrl(String folder, String url, int depth, String expression
177
185
context .write (String .format ("Create dir %s" , dir ));
178
186
}
179
187
180
- if (clean ) {
181
- doc .getElementsByTag ("script" ).remove ();
182
- doc .getElementsByTag ("style" ).remove ();
183
- doc .getElementsByTag ("meta" ).remove ();
184
- }
185
-
186
188
downloadImages (doc .getElementsByTag ("img" ), folder , expression );
187
189
188
190
if (filter == null || title .contains (filter )) {
191
+ var out = doc .clone ();
192
+ if (clean ) {
193
+ out .getElementsByTag ("script" ).remove ();
194
+ out .getElementsByTag ("style" ).remove ();
195
+ out .getElementsByTag ("link" ).remove ();
196
+ out .getElementsByTag ("meta" ).remove ();
197
+ }
198
+ if (StringUtils .isNotBlank (cleanexp )) {
199
+ out .select (cleanexp ).remove ();
200
+ }
189
201
for (OutputType type : outputTypes ) {
190
- type .applyOutput (doc , folder , title );
202
+ type .applyOutput (out , folder , title );
191
203
context .write (String .format ("Save %s file to %s from <<%s>>: %s" , type , folder , title , url ));
192
204
}
193
205
}
@@ -264,9 +276,48 @@ public Args config() {
264
276
.arg (new Arg ("proxy" , null , "request proxy" , false , "127.0.0.1:8087" ))
265
277
.arg (new Arg ("out" , "./" , "output folder" , false , null ))
266
278
.arg (new Arg ("clean" , "false" , "remove css or javascript" , false , null ))
279
+ .arg (new Arg ("cleanexp" , null , "clean element by CSS-like element selector" , false , "a[href]" ))
280
+ .arg (new Arg ("replace" , null , "replace some text" , false , "test+" ))
267
281
.runnerTypes (EnumSet .allOf (RunnerType .class ));
268
282
}
269
283
284
+ private record Expression (String tag , Map <String , String > attributes ) { }
285
+
286
+ // tag:a,attr:ref=1&ref1=2;
287
+ static List <Map <String , Expression >> parseExp (String args ) {
288
+ List <Map <String , Expression >> result = new ArrayList <>();
289
+ for (String input : args .split (";" )) {
290
+ String [] keyValuePairs = input .split ("," );
291
+ var pars = new HashMap <String , Expression >();
292
+ for (String pair : keyValuePairs ) {
293
+ String [] keyValue = pair .split (":" );
294
+ if (keyValue .length == 2 ) {
295
+ String key = keyValue [0 ];
296
+ pars .put (pair , new Expression (key , parseAttr (keyValue [1 ])));
297
+ } else {
298
+ throw new IllegalArgumentException ("Invalid expression: " + input );
299
+ }
300
+ }
301
+ result .add (pars );
302
+ }
303
+
304
+ return result ;
305
+ }
306
+
307
+ private static Map <String , String > parseAttr (String value ) {
308
+ Map <String , String > subMap = new HashMap <>();
309
+ String [] subPairs = value .split ("&" );
310
+ for (String subPair : subPairs ) {
311
+ String [] subKeyValue = subPair .split ("=" );
312
+ if (subKeyValue .length == 2 ) {
313
+ subMap .put (subKeyValue [0 ], subKeyValue [1 ]);
314
+ } else {
315
+ subMap .put (subPair , subPair );
316
+ }
317
+ }
318
+ return subMap ;
319
+ }
320
+
270
321
private static String checkText (Element e , String out ) {
271
322
return StringUtils .isEmpty (e .text ()) ? null : out ;
272
323
}
@@ -277,7 +328,7 @@ enum OutputType {
277
328
if (file .exists ()) {
278
329
return ;
279
330
}
280
- FileUtils .writeStringToFile (file , d .outerHtml (), StandardCharsets .UTF_8 );
331
+ FileUtils .writeStringToFile (file , REPLACER . replace ( d .outerHtml () ), StandardCharsets .UTF_8 );
281
332
}),
282
333
txt (HtmlDownloader ::toPlainText ),
283
334
markdown (HtmlDownloader ::toMarkdown ),
@@ -299,6 +350,17 @@ private void finished() {
299
350
300
351
}
301
352
353
+ private record Replacer (String regex ) {
354
+
355
+ String replace (String text ) {
356
+ if (StringUtils .isEmpty (regex )) {
357
+ return text ;
358
+ }
359
+ return text .replaceAll (regex , "" );
360
+ }
361
+
362
+ }
363
+
302
364
private interface Output {
303
365
304
366
void write (Document body , String folder , String title ) throws IOException ;
0 commit comments