Skip to content

Commit 9bb2417

Browse files
zyw61483 and zhaoyiwei authored
修改SmartContentSelector threshold可定制化 (#1183)
* 修改SmartContentSelector threshold可定制化
* 修改SmartContentSelector threshold可定制化

---------

Co-authored-by: zhaoyiwei <zhaoyiwei@zhongan.com>
1 parent 8dc4174 commit 9bb2417

File tree

3 files changed, +15 -1 lines changed

3 files changed, +15 -1 lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,11 @@ public Selectable smartContent() {
3131
return select(smartContentSelector, getSourceTexts());
3232
}
3333

34+
public Selectable smartContent(int threshold) {
35+
SmartContentSelector smartContentSelector = Selectors.smartContent(threshold);
36+
return select(smartContentSelector, getSourceTexts());
37+
}
38+
3439
@Override
3540
public Selectable links() {
3641
return selectElements(new LinksSelector());

webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() {
2020
return new SmartContentSelector();
2121
}
2222

23+
public static SmartContentSelector smartContent(int threshold) {
24+
return new SmartContentSelector(threshold);
25+
}
26+
2327
public static CssSelector $(String expr) {
2428
return new CssSelector(expr);
2529
}

webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,15 @@
1616
@Experimental
1717
public class SmartContentSelector implements Selector {
1818

19+
private int threshold = 86;
20+
1921
public SmartContentSelector() {
2022
}
2123

24+
public SmartContentSelector(int threshold) {
25+
this.threshold = threshold;
26+
}
27+
2228
@Override
2329
public String select(String html) {
2430
html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
@@ -29,7 +35,6 @@ public String select(String html) {
2935
html = html.replaceAll("(?is)<.*?>", "");
3036
List<String> lines;
3137
int blocksWidth =3;
32-
int threshold =86;
3338
int start;
3439
int end;
3540
StringBuilder text = new StringBuilder();

0 commit comments

Comments
 (0)