6
6
import java .io .FileOutputStream ;
7
7
import java .io .IOException ;
8
8
import java .net .InetSocketAddress ;
9
+ import java .net .MalformedURLException ;
9
10
import java .net .Proxy ;
10
11
import java .net .Proxy .Type ;
12
+ import java .net .URL ;
11
13
import java .nio .charset .StandardCharsets ;
12
14
import java .nio .file .Paths ;
15
+ import java .time .Duration ;
13
16
import java .util .ArrayList ;
14
17
import java .util .Arrays ;
15
18
import java .util .Base64 ;
18
21
import java .util .HashSet ;
19
22
import java .util .List ;
20
23
import java .util .Map ;
24
+ import java .util .Objects ;
25
+ import java .util .Optional ;
21
26
import java .util .Set ;
22
27
import java .util .function .Function ;
23
28
import java .util .stream .Collectors ;
44
49
import org .jsoup .safety .Cleaner ;
45
50
import org .jsoup .safety .Safelist ;
46
51
import org .jsoup .select .Elements ;
52
+ import org .openqa .selenium .By ;
53
+ import org .openqa .selenium .WebDriver ;
54
+ import org .openqa .selenium .chrome .ChromeDriver ;
55
+ import org .openqa .selenium .chrome .ChromeOptions ;
56
+ import org .openqa .selenium .support .ui .ExpectedConditions ;
57
+ import org .openqa .selenium .support .ui .WebDriverWait ;
47
58
48
59
@ CustomLog
49
60
public class HtmlDownloader implements Tool {
@@ -60,9 +71,13 @@ public class HtmlDownloader implements Tool {
60
71
private Set <OutputType > outputTypes = EnumSet .of (OutputType .html );
61
72
private Proxy proxy ;
62
73
private String filter ;
74
+ private Optional <String > webDriver ;
63
75
private ToolContext context ;
76
+ private Args args ;
64
77
65
78
private Connection connection ;
79
+ private WebDriver driver ;
80
+
66
81
67
82
@ Override
68
83
public ToolName name () {
@@ -73,6 +88,7 @@ public ToolName name() {
73
88
public void run (Args args ) throws Exception {
74
89
var urls = new ArrayList <String >();
75
90
context = args .getContext ();
91
+ this .args = args ;
76
92
77
93
args .readArg ("urlFile" ).ifPresent (a -> {
78
94
try {
@@ -89,6 +105,7 @@ public void run(Args args) throws Exception {
89
105
.collect (Collectors .toSet ());
90
106
var img = args .readArg ("img" ).val ();
91
107
filter = args .readArg ("filter" ).getVal ().orElse (null );
108
+ webDriver = args .readArg ("webDriver" ).getVal ().filter (a -> !a .isBlank ());
92
109
REPLACER = new Replacer (args .readArg ("replace" ).getVal ().orElse (null ));
93
110
cleanexp = args .readArg ("cleanexp" ).getVal ().orElse (null );
94
111
@@ -104,17 +121,28 @@ public void run(Args args) throws Exception {
104
121
});
105
122
depth = Integer .parseInt (args .readArg ("depth" ).val ());
106
123
107
- for (String url : urls ) {
108
- downloadUrl (out , url , depth , img );
124
+ try {
125
+ for (String url : urls ) {
126
+ downloadUrl (out , url , depth , img );
127
+ }
128
+ } finally {
129
+ release ();
109
130
}
110
131
132
+
111
133
history .clear ();
112
134
113
135
for (OutputType type : outputTypes ) {
114
136
type .finished ();
115
137
}
116
138
}
117
139
140
+ private void release () {
141
+ if (driver != null ) {
142
+ driver .quit ();
143
+ }
144
+ }
145
+
118
146
private Connection getConnection () {
119
147
if (connection == null ) {
120
148
connection = Jsoup .newSession ().proxy (proxy );
@@ -159,6 +187,33 @@ private static void convertTagToMd(Element element, ArrayList<String> md) {
159
187
160
188
}
161
189
190
+ @ SuppressWarnings ("OptionalGetWithoutIsPresent" )
191
+ private Document loadDynamicWeb (String url ) {
192
+ System .setProperty ("webdriver.chrome.driver" , webDriver .get ());
193
+ var options = new ChromeOptions ();
194
+ options .addArguments ("--headless" )
195
+ .addArguments ("--disable-gpu" )
196
+ .addArguments ("--window-size=1920,1080" )
197
+ .addArguments ("--disable-dev-shm-usage" );
198
+ if (proxy != null ) {
199
+ options .addArguments ("--proxy-server=http://" + proxy .address ().toString ().substring (1 ));
200
+ }
201
+
202
+ if (driver == null ) {
203
+ driver = new ChromeDriver (options );
204
+ }
205
+
206
+ driver .get (url );
207
+
208
+ var waitEleId = args .readArg ("waitexp" );
209
+ if (waitEleId .isPresent ()) {
210
+ var wait = new WebDriverWait (driver , Duration .ofSeconds (10 ));
211
+ wait .until (
212
+ ExpectedConditions .visibilityOfElementLocated (By .cssSelector (waitEleId .val ())));
213
+ }
214
+ return Jsoup .parse (driver .getPageSource ());
215
+ }
216
+
162
217
private void downloadUrl (String folder , String url , int depth , String expression ) throws IOException {
163
218
if (StringUtils .isBlank (url )) {
164
219
return ;
@@ -172,6 +227,8 @@ private void downloadUrl(String folder, String url, int depth, String expression
172
227
Document doc ;
173
228
if (url .startsWith ("file" )) {
174
229
doc = Jsoup .parse (new File (url .substring (5 )));
230
+ } else if (webDriver .isPresent ()) {
231
+ doc = loadDynamicWeb (url );
175
232
} else {
176
233
doc = getConnection ().url (url ).get ();
177
234
}
@@ -213,9 +270,14 @@ private void downloadUrl(String folder, String url, int depth, String expression
213
270
return ;
214
271
}
215
272
216
- var links = doc .select ("a[href]" );
217
- for (Element link : links ) {
218
- var href = link .attr ("abs:href" );
273
+ var links = doc .select ("a[href]" )
274
+ .stream ()
275
+ .map (a -> cleanHref (url , a .attr ("href" )))
276
+ .filter (Objects ::nonNull )
277
+ .filter (a -> Objects .equals (baseUrl (a ), baseUrl (url )))
278
+ .filter (f -> args .readArg ("hrefFilter" ).test (f ::contains ))
279
+ .collect (Collectors .toSet ());
280
+ for (String href : links ) {
219
281
try {
220
282
downloadUrl (folder , href , depth , "false" );
221
283
} catch (Exception e ) {
@@ -224,6 +286,44 @@ private void downloadUrl(String folder, String url, int depth, String expression
224
286
}
225
287
}
226
288
289
+ private String cleanHref (String url , String relativeUrl ) {
290
+ if (relativeUrl != null && !relativeUrl .isBlank ()) {
291
+ var baseUrl = baseUrl (url );
292
+ if (relativeUrl .startsWith ("http" ) || relativeUrl .startsWith ("https" )) {
293
+ return relativeUrl ;
294
+ } else if (relativeUrl .startsWith ("/" )) {
295
+ return baseUrl + relativeUrl ;
296
+ } else {
297
+ return baseUrl + "/" + relativeUrl ;
298
+ }
299
+ }
300
+ return null ;
301
+ }
302
+
303
+ private String baseUrl (String fullUrl ) {
304
+ URL url ;
305
+ try {
306
+ url = new URL (fullUrl );
307
+ } catch (MalformedURLException e ) {
308
+ return null ;
309
+ }
310
+ var protocol = url .getProtocol ();
311
+ var host = url .getHost ();
312
+ int port = url .getPort ();
313
+ if (port == -1 ) {
314
+ port = protocol .equals ("https" ) ? 443 : 80 ;
315
+ }
316
+ var path = url .getPath ();
317
+ if (path != null ) {
318
+ var ps = path .split ("/" );
319
+ if (ps .length > 1 ) {
320
+ ps = Arrays .copyOfRange (ps , 0 , ps .length -1 );
321
+ }
322
+ path = String .join ("/" , ps );
323
+ }
324
+ return STR ."\{protocol }://\{host }\{port == 80 || port == 443 ? "" : STR .":\{port }" }\{path }" ;
325
+ }
326
+
227
327
private void downloadImages (Elements imgs , String folder , String expression ) throws IOException {
228
328
boolean removeImg = "false" .equals (expression );
229
329
if (!removeImg ) {
@@ -278,6 +378,9 @@ public Args config() {
278
378
.arg (new Arg ("clean" , "false" , "remove css or javascript" , false , null ))
279
379
.arg (new Arg ("cleanexp" , null , "clean element by CSS-like element selector" , false , "a[href]" ))
280
380
.arg (new Arg ("replace" , null , "replace some text" , false , "test+" ))
381
+ .arg (new Arg ("webDriver" , null , "web driver path" , false , "/usr/local/bin/chromedriver" ))
382
+ .arg (new Arg ("waitexp" , null , "wait element by CSS-like element selector" , false , "a[href]" ))
383
+ .arg (new Arg ("hrefFilter" , null , "sub-link filter" , false , "demo" ))
281
384
.runnerTypes (EnumSet .allOf (RunnerType .class ));
282
385
}
283
386
0 commit comments