Skip to content

Commit afdba22

Browse files
committed
htmlDown: supports selenium #log
1 parent e5fae4f commit afdba22

File tree

2 files changed

+119
-5
lines changed

2 files changed

+119
-5
lines changed

src/main/java/net/cofcool/sourcebox/internal/HtmlDownloader.java

Lines changed: 108 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,13 @@
66
import java.io.FileOutputStream;
77
import java.io.IOException;
88
import java.net.InetSocketAddress;
9+
import java.net.MalformedURLException;
910
import java.net.Proxy;
1011
import java.net.Proxy.Type;
12+
import java.net.URL;
1113
import java.nio.charset.StandardCharsets;
1214
import java.nio.file.Paths;
15+
import java.time.Duration;
1316
import java.util.ArrayList;
1417
import java.util.Arrays;
1518
import java.util.Base64;
@@ -18,6 +21,8 @@
1821
import java.util.HashSet;
1922
import java.util.List;
2023
import java.util.Map;
24+
import java.util.Objects;
25+
import java.util.Optional;
2126
import java.util.Set;
2227
import java.util.function.Function;
2328
import java.util.stream.Collectors;
@@ -44,6 +49,12 @@
4449
import org.jsoup.safety.Cleaner;
4550
import org.jsoup.safety.Safelist;
4651
import org.jsoup.select.Elements;
52+
import org.openqa.selenium.By;
53+
import org.openqa.selenium.WebDriver;
54+
import org.openqa.selenium.chrome.ChromeDriver;
55+
import org.openqa.selenium.chrome.ChromeOptions;
56+
import org.openqa.selenium.support.ui.ExpectedConditions;
57+
import org.openqa.selenium.support.ui.WebDriverWait;
4758

4859
@CustomLog
4960
public class HtmlDownloader implements Tool {
@@ -60,9 +71,13 @@ public class HtmlDownloader implements Tool {
6071
private Set<OutputType> outputTypes = EnumSet.of(OutputType.html);
6172
private Proxy proxy;
6273
private String filter;
74+
private Optional<String> webDriver;
6375
private ToolContext context;
76+
private Args args;
6477

6578
private Connection connection;
79+
private WebDriver driver;
80+
6681

6782
@Override
6883
public ToolName name() {
@@ -73,6 +88,7 @@ public ToolName name() {
7388
public void run(Args args) throws Exception {
7489
var urls = new ArrayList<String>();
7590
context = args.getContext();
91+
this.args = args;
7692

7793
args.readArg("urlFile").ifPresent(a -> {
7894
try {
@@ -89,6 +105,7 @@ public void run(Args args) throws Exception {
89105
.collect(Collectors.toSet());
90106
var img = args.readArg("img").val();
91107
filter = args.readArg("filter").getVal().orElse(null);
108+
webDriver = args.readArg("webDriver").getVal().filter(a -> !a.isBlank());
92109
REPLACER = new Replacer(args.readArg("replace").getVal().orElse(null));
93110
cleanexp = args.readArg("cleanexp").getVal().orElse(null);
94111

@@ -104,17 +121,28 @@ public void run(Args args) throws Exception {
104121
});
105122
depth = Integer.parseInt(args.readArg("depth").val());
106123

107-
for (String url : urls) {
108-
downloadUrl(out, url, depth, img);
124+
try {
125+
for (String url : urls) {
126+
downloadUrl(out, url, depth, img);
127+
}
128+
} finally {
129+
release();
109130
}
110131

132+
111133
history.clear();
112134

113135
for (OutputType type : outputTypes) {
114136
type.finished();
115137
}
116138
}
117139

140+
private void release() {
141+
if (driver != null) {
142+
driver.quit();
143+
}
144+
}
145+
118146
private Connection getConnection() {
119147
if (connection == null) {
120148
connection = Jsoup.newSession().proxy(proxy);
@@ -159,6 +187,33 @@ private static void convertTagToMd(Element element, ArrayList<String> md) {
159187

160188
}
161189

190+
@SuppressWarnings("OptionalGetWithoutIsPresent")
191+
private Document loadDynamicWeb(String url) {
192+
System.setProperty("webdriver.chrome.driver", webDriver.get());
193+
var options = new ChromeOptions();
194+
options.addArguments("--headless")
195+
.addArguments("--disable-gpu")
196+
.addArguments("--window-size=1920,1080")
197+
.addArguments("--disable-dev-shm-usage");
198+
if (proxy != null) {
199+
options.addArguments("--proxy-server=http://" + proxy.address().toString().substring(1));
200+
}
201+
202+
if (driver == null) {
203+
driver = new ChromeDriver(options);
204+
}
205+
206+
driver.get(url);
207+
208+
var waitEleId = args.readArg("waitexp");
209+
if (waitEleId.isPresent()) {
210+
var wait = new WebDriverWait(driver, Duration.ofSeconds(10));
211+
wait.until(
212+
ExpectedConditions.visibilityOfElementLocated(By.cssSelector(waitEleId.val())));
213+
}
214+
return Jsoup.parse(driver.getPageSource());
215+
}
216+
162217
private void downloadUrl(String folder, String url, int depth, String expression) throws IOException {
163218
if (StringUtils.isBlank(url)) {
164219
return;
@@ -172,6 +227,8 @@ private void downloadUrl(String folder, String url, int depth, String expression
172227
Document doc;
173228
if (url.startsWith("file")) {
174229
doc = Jsoup.parse(new File(url.substring(5)));
230+
} else if (webDriver.isPresent()) {
231+
doc = loadDynamicWeb(url);
175232
} else {
176233
doc = getConnection().url(url).get();
177234
}
@@ -213,9 +270,14 @@ private void downloadUrl(String folder, String url, int depth, String expression
213270
return;
214271
}
215272

216-
var links = doc.select("a[href]");
217-
for (Element link : links) {
218-
var href = link.attr("abs:href");
273+
var links = doc.select("a[href]")
274+
.stream()
275+
.map(a -> cleanHref(url, a.attr("href")))
276+
.filter(Objects::nonNull)
277+
.filter(a -> Objects.equals(baseUrl(a), baseUrl(url)))
278+
.filter(f -> args.readArg("hrefFilter").test(f::contains))
279+
.collect(Collectors.toSet());
280+
for (String href : links) {
219281
try {
220282
downloadUrl(folder, href, depth, "false");
221283
} catch (Exception e) {
@@ -224,6 +286,44 @@ private void downloadUrl(String folder, String url, int depth, String expression
224286
}
225287
}
226288

289+
private String cleanHref(String url, String relativeUrl) {
290+
if (relativeUrl != null && !relativeUrl.isBlank()) {
291+
var baseUrl = baseUrl(url);
292+
if (relativeUrl.startsWith("http") || relativeUrl.startsWith("https")) {
293+
return relativeUrl;
294+
} else if (relativeUrl.startsWith("/")) {
295+
return baseUrl + relativeUrl;
296+
} else {
297+
return baseUrl + "/" + relativeUrl;
298+
}
299+
}
300+
return null;
301+
}
302+
303+
private String baseUrl(String fullUrl) {
304+
URL url;
305+
try {
306+
url = new URL(fullUrl);
307+
} catch (MalformedURLException e) {
308+
return null;
309+
}
310+
var protocol = url.getProtocol();
311+
var host = url.getHost();
312+
int port = url.getPort();
313+
if (port == -1) {
314+
port = protocol.equals("https") ? 443 : 80;
315+
}
316+
var path = url.getPath();
317+
if (path != null) {
318+
var ps = path.split("/");
319+
if (ps.length > 1) {
320+
ps = Arrays.copyOfRange(ps, 0, ps.length-1);
321+
}
322+
path = String.join("/", ps);
323+
}
324+
return STR."\{protocol}://\{host}\{port == 80 || port == 443 ? "" : STR.":\{port}"}\{path}";
325+
}
326+
227327
private void downloadImages(Elements imgs, String folder, String expression) throws IOException {
228328
boolean removeImg = "false".equals(expression);
229329
if (!removeImg) {
@@ -278,6 +378,9 @@ public Args config() {
278378
.arg(new Arg("clean", "false", "remove css or javascript", false, null))
279379
.arg(new Arg("cleanexp", null, "clean element by CSS-like element selector", false, "a[href]"))
280380
.arg(new Arg("replace", null, "replace some text", false, "test+"))
381+
.arg(new Arg("webDriver", null, "web driver path", false, "/usr/local/bin/chromedriver"))
382+
.arg(new Arg("waitexp", null, "wait element by CSS-like element selector", false, "a[href]"))
383+
.arg(new Arg("hrefFilter", null, "sub-link filter", false, "demo"))
281384
.runnerTypes(EnumSet.allOf(RunnerType.class));
282385
}
283386

src/test/java/net/cofcool/sourcebox/internal/HtmlDownloaderTest.java

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
import net.cofcool.sourcebox.Utils;
1313
import net.cofcool.sourcebox.internal.HtmlDownloader.OutputType;
1414
import org.jsoup.Jsoup;
15+
import org.junit.jupiter.api.Disabled;
1516
import org.junit.jupiter.api.Test;
1617
import org.junit.jupiter.api.io.TempDir;
1718

@@ -154,6 +155,16 @@ void runWithUrlFile() throws Exception {
154155
assertTrue(files.length > 0);
155156
}
156157

158+
@Disabled
159+
@Test
160+
void runWithWebDriver() throws Exception {
161+
instance().run(args
162+
.arg("url", "https://bing.com")
163+
.arg("webDriver", "/usr/local/bin/chromedriver")
164+
.arg("out", "/tmp/aa")
165+
);
166+
}
167+
157168
@Override
158169
protected Tool instance() {
159170
return new HtmlDownloader();

0 commit comments

Comments
 (0)