Skip to content

Commit d39394b

Browse files
author
肖明晨
committed
2
1 parent 2987dbd commit d39394b

File tree

12 files changed

+225
-68
lines changed

12 files changed

+225
-68
lines changed

pom.xml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,34 @@
123123
<version>1.0.0</version>
124124
</dependency>
125125

126+
<dependency>
127+
<groupId>cn.hutool</groupId>
128+
<artifactId>hutool-core</artifactId>
129+
<version>4.6.10</version>
130+
</dependency>
131+
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
126132
<dependency>
127133
<groupId>net.sourceforge.tess4j</groupId>
128134
<artifactId>tess4j</artifactId>
129-
<version>2.0.1</version>
135+
<version>4.4.1</version>
136+
<exclusions>
137+
<exclusion>
138+
<groupId>org.slf4j</groupId>
139+
<artifactId>jul-to-slf4j</artifactId>
140+
</exclusion>
141+
<exclusion>
142+
<groupId>org.slf4j</groupId>
143+
<artifactId>jcl-over-slf4j</artifactId>
144+
</exclusion>
145+
<exclusion>
146+
<groupId>org.slf4j</groupId>
147+
<artifactId>log4j-over-slf4j</artifactId>
148+
</exclusion>
149+
<exclusion>
150+
<groupId>ch.qos.logback</groupId>
151+
<artifactId>logback-classic</artifactId>
152+
</exclusion>
153+
</exclusions>
130154
</dependency>
131155
</dependencies>
132156

src/main/java/com/cc/etherscan/io/common/Constants.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ public class Constants {
99

1010
public static final String URL_DETAILS = "https://etherscan\\.io/address/\\w\\#code";
1111
public static final String SUB_CAT = "https://www.yaofangwang\\.com/catalog-\\w+\\.html";
12-
public static final String SUB_CAT_SP = "https://www.yaofangwang\\.com/catalog-\\w\\-\\w+\\.html";
13-
public static final String SUB_CAT_FG = "https://www.yaofangwang.com/catalog-%s-%s.html";
12+
public static final String SUB_CAT_SP = "https://www.yaofangwang\\.com/catalog-\\w+\\-p\\w+\\.html";
13+
public static final String SUB_CAT_FG = "https://www.yaofangwang.com/catalog-%s-p%s.html";
1414
public static final String SUB_CAT_MED_DETAILS = "https://www.yaofangwang.com/medicine-\\w+\\.html";
1515
}

src/main/java/com/cc/etherscan/io/processor/PharmacyProcessor.java

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
package com.cc.etherscan.io.processor;
22

33
import com.cc.etherscan.io.common.Constants;
4+
import com.cc.etherscan.io.entity.YfwProductInfo;
5+
import com.cc.etherscan.io.mapper.YfwProductInfoDao;
6+
import com.cc.etherscan.io.util.Tess4jUtils;
47
import lombok.extern.slf4j.Slf4j;
8+
import net.sourceforge.tess4j.TesseractException;
59
import org.apache.commons.lang.StringUtils;
6-
import org.jsoup.select.Elements;
710
import us.codecraft.webmagic.Page;
811
import us.codecraft.webmagic.Site;
912
import us.codecraft.webmagic.processor.PageProcessor;
10-
import us.codecraft.webmagic.selector.Html;
1113

14+
import java.io.IOException;
1215
import java.util.List;
1316

1417
import static com.cc.etherscan.io.common.Constants.SUB_CAT_MED_DETAILS;
@@ -24,84 +27,124 @@ public class PharmacyProcessor implements PageProcessor {
2427

2528
private Site site;
2629

30+
public PharmacyProcessor(YfwProductInfoDao yfwProductInfoDao) {
31+
this.yfwProductInfoDao = yfwProductInfoDao;
32+
}
33+
34+
private YfwProductInfoDao yfwProductInfoDao;
35+
2736
@Override
2837
public void process(Page page) {
2938
// subcat
3039
String requestUrl = page.getUrl().get();
3140
List<String> catList = page.getHtml().xpath("//div[@class='subcat']").links().all();
32-
if (catList.size() > 0) {
33-
log.info("分类地址数量: {}", catList.size());
34-
}
3541
if (page.getUrl().regex(Constants.SUB_CAT).match()) {
3642
// 获取总页数,得到当前适应症下的所有二级页面
3743
//
3844
String catId = requestUrl.substring(requestUrl.indexOf("-") + 1, requestUrl.indexOf(".html"));
39-
String s = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[3]/div[3]/span").get();
45+
String s = page.getHtml().xpath("////*[@id=\"wrap\"]/div[2]/div[3]/div[3]/span/text()").get();
4046
String[] split = StringUtils.split(s, "/");
4147
if (split.length > 1) {
4248
int totalPage = Integer.parseInt(StringUtils.trim(split[1]));
49+
log.info("【SUB_CAT】页面: {}, 子页面数:{}", page.getUrl().get(), totalPage);
4350
for (int i = 1; i <= totalPage; i++) {
4451
String catItemUrl = String.format(Constants.SUB_CAT_FG, catId, i);
4552
page.addTargetRequest(catItemUrl);
4653
}
4754
}
4855
} else if (page.getUrl().regex(SUB_CAT_SP).match()) {
49-
List<String> all = page.getHtml().xpath("//div[@class=\"goodlist\"]/a[@class=\"photo\"]").links().all();
56+
List<String> all = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/ul/li/div/a[@class=\"photo\"]").links().all();
57+
log.info("【SUB_CAT_SP】页面: {}, 子页面数:{}", page.getUrl().get(), all.size());
5058
page.addTargetRequests(all);
5159
} else if (page.getUrl().regex(SUB_CAT_MED_DETAILS).match()) {
60+
log.info("【SUB_CAT_MED_DETAILS】页面: {}", page.getUrl().get());
61+
YfwProductInfo item = new YfwProductInfo();
62+
String productId = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[1]/div[2]/div/text()").get();
63+
productId = productId.substring(productId.indexOf(":") + 1, productId.length() - 1);
64+
item.setProductId(productId);
5265
// 商品名
5366
String productName = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/h1/strong/span/text()").get();
67+
item.setProductName(productName);
5468
// 通用名
5569
String commodityName = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[1]/strong/text()").get();
70+
item.setCommodityName(commodityName);
5671
// 商标
5772
String trademark = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[2]/text()").get();
73+
item.setTradeMark(trademark);
5874
// 挤型
5975
String jixing = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[4]/text()").get();
76+
item.setDosage(jixing);
6077
// 规格
6178
String spec = page.getHtml().xpath("//*[@id=\"standardOther\"]/div[1]/text()").get();
79+
item.setSpec(spec);
6280
// 生产厂家
6381
String cj = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[5]/text()").get();
82+
item.setFactory(cj);
6483
// 有效期
6584
String ex = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[6]/label/text()").get();
85+
item.setExpired(ex);
86+
87+
String pzwh = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[7]/div/img/@src").get();
88+
String approvalNumber = "";
89+
try {
90+
if (StringUtils.isNotEmpty(pzwh) && !StringUtils.equalsIgnoreCase(pzwh, "null")) {
91+
approvalNumber = Tess4jUtils.getApprovalNumber("https:" + pzwh);
92+
item.setApprovalNumber(approvalNumber);
93+
}
94+
} catch (TesseractException e) {
95+
e.printStackTrace();
96+
} catch (Exception e) {
97+
e.printStackTrace();
98+
}
99+
// 浏览次数
100+
String llcs = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[9]/text()").get();
101+
item.setViewCount(llcs);
66102
// 适应症
67103
String syz = page.getHtml().xpath("//*[@id=\"wrap\"]/div[2]/div[2]/div/dl[1]/dd[8]/strong/text()").get();
104+
item.setIndication(syz);
68105
// 说明书_标题
69106
String sms_title = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/h2/text()").get();
107+
item.setSmsTitle(sms_title);
70108
String sms_tips = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/div[1]/text()").get();
71-
109+
item.setSmsTips(sms_tips);
72110
// 药品名称
73111
String sms_name_cname = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/div[3]/dl/dd[1]/text()").get();
112+
item.setSmsCname(sms_name_cname);
74113
String sms_name_ename = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/div[3]/dl/dd[3]/text()").get();
114+
item.setSmsEname(sms_name_ename);
75115
String sms_name_py = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/div[3]/dl/dd[4]/text()").get();
116+
item.setSmsPy(sms_name_py);
76117
// 执行标准
77118
String sms_zxbz = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[1]/p/text()").get();
119+
item.setSmsZxbz(sms_zxbz);
78120
// 性状
79121
String sms_xz = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[2]/p/text()").get();
122+
item.setSmsXz(sms_xz);
80123
// 组方/成份
81124
String sms_cf = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[3]/p/text()").get();
125+
item.setSmsCf(sms_cf);
82126
// 功能与主治
83127
String sms_gnzz = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[4]/p/text()").get();
128+
item.setSmsGnzz(sms_gnzz);
84129
// 用法用量
85130
String sms_yfyl = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[5]/p/text()").get();
131+
item.setSmsYfyl(sms_yfyl);
86132
// 不良反应
87133
String sms_blfy = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[6]/p/text()").get();
134+
item.setSmsBlfy(sms_blfy);
88135
// 禁忌症
89136
String sms_jjz = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[7]/p/text()").get();
137+
item.setSmsJjz(sms_jjz);
90138
// 药物相互作用
91139
String sms_ywzy = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[8]/p/text()").get();
140+
item.setSmsYwzy(sms_ywzy);
92141
// 贮藏
93142
String sms_zc = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[9]/p/text()").get();
94-
String productImgText = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[10]/textarea/text()").get();
95-
Html html = new Html(productImgText);
96-
Page page1 = new Page();
97-
page1.setHtml(html);
98-
List<String> all = page1.getHtml().xpath("//*a/@href").all();
99-
String sms_img = page.getHtml().xpath("//*[@id=\"guide\"]/div[1]/dl/dd[11]/textarea/text()").get();
100-
Html sms_img_html = new Html(sms_img);
101-
Elements a = sms_img_html.getDocument().getElementsByTag("a");
102-
String href = a.get(0).attr("href");
103-
System.out.println(sms_blfy);
143+
item.setSmsZc(sms_zc);
144+
int insert = yfwProductInfoDao.insert(item);
145+
log.info("【SUB_CAT_MED_DETAILS】页面: {}, 保存结果:{}", page.getUrl().get(), insert);
104146
} else {
147+
log.info("【INDEX】页面: {}, 子页面数:{}", page.getUrl().get(), catList.size());
105148
page.addTargetRequests(catList);
106149
}
107150
}

src/main/java/com/cc/etherscan/io/schedule/EtherEumSchedule.java

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
package com.cc.etherscan.io.schedule;
22

33
import com.cc.etherscan.io.mapper.EtherContractMapper;
4+
import com.cc.etherscan.io.mapper.YfwProductInfoDao;
45
import com.cc.etherscan.io.pipeline.EthereumPipeline;
5-
import com.cc.etherscan.io.processor.EthereumContractProcessor;
6+
import com.cc.etherscan.io.processor.PharmacyProcessor;
67
import org.springframework.beans.factory.annotation.Value;
7-
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
88
import org.springframework.data.redis.core.RedisTemplate;
9-
import org.springframework.scheduling.annotation.EnableScheduling;
10-
import org.springframework.scheduling.annotation.Scheduled;
119
import org.springframework.stereotype.Component;
1210
import us.codecraft.webmagic.Spider;
1311

@@ -30,6 +28,9 @@ public class EtherEumSchedule {
3028
@Resource
3129
private EtherContractMapper etherContractMapper;
3230

31+
@Resource
32+
private YfwProductInfoDao yfwProductInfoDao;
33+
3334
@Value("${etherscan.pageSize}")
3435
private int pageSize = 0;
3536

@@ -48,18 +49,11 @@ public class EtherEumSchedule {
4849
//@Scheduled(cron="${etherscan.scanIntervalCron}")
4950
@PostConstruct
5051
public void start() {
51-
for (int i = startPage; i<= totalPage; i++) {
52-
Spider.create(new EthereumContractProcessor(redisTemplate, etherContractMapper))
53-
.addUrl("https://etherscan.io/contractsVerified/" + i + "?ps=" + pageSize)
54-
.addPipeline(ethereumPipeline)
55-
.thread(threadCount)
56-
.run();
57-
try {
58-
Thread.sleep(intervalSeconds * 1000);
59-
} catch (InterruptedException e) {
60-
e.printStackTrace();
61-
}
62-
}
63-
52+
Spider.create(new PharmacyProcessor(yfwProductInfoDao))
53+
// .addUrl("https://www.yaofangwang.com/medicine-261518.html")
54+
// .addUrl("https://www.yaofangwang.com/catalog-10-p8.html")
55+
.addUrl("https://www.yaofangwang.com")
56+
.thread(1)
57+
.run();
6458
}
6559
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package com.cc.etherscan.io.util;
2+
3+
import cn.hutool.core.util.StrUtil;
4+
import lombok.experimental.UtilityClass;
5+
import lombok.extern.slf4j.Slf4j;
6+
import net.sourceforge.tess4j.ITesseract;
7+
import net.sourceforge.tess4j.Tesseract;
8+
import net.sourceforge.tess4j.TesseractException;
9+
import org.apache.commons.lang3.StringUtils;
10+
11+
import javax.imageio.ImageIO;
12+
import java.awt.*;
13+
import java.awt.image.BufferedImage;
14+
import java.io.IOException;
15+
import java.net.URL;
16+
17+
/**
18+
* Tess4jUtils类
19+
* <p>
20+
*
21+
* @author wxd(ttzommed @ foxmail.com)
22+
*/
23+
@UtilityClass
24+
@Slf4j
25+
public class Tess4jUtils {
26+
//OCR识别
27+
private static ITesseract instance = new Tesseract();
28+
29+
static {
30+
instance.setDatapath("D:\\tess4j");
31+
instance.setLanguage("chi_sim+eng");
32+
}
33+
34+
public String getApprovalNumber(String url) throws TesseractException, IOException {
35+
log.info("url: {}", url);
36+
if (StringUtils.isBlank(url)) {
37+
return null;
38+
}
39+
BufferedImage inputImage = ImageIO.read(new URL(url));
40+
return StrUtil.removeAll(instance.doOCR(resize(inputImage, 3)), " ");
41+
}
42+
43+
public BufferedImage resize(BufferedImage img, int SCALE) {
44+
BufferedImage bi = new BufferedImage(SCALE * img.getWidth(null), SCALE
45+
* img.getHeight(null), BufferedImage.TYPE_INT_ARGB);
46+
Graphics2D grph = (Graphics2D) bi.getGraphics();
47+
grph.scale(SCALE, SCALE);
48+
grph.drawImage(img, 0, 0, null);
49+
grph.dispose();
50+
return bi;
51+
}
52+
}

src/main/resources/application-dev.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ spring:
44
port: 30379
55
password: 2aVzLk8jcmJHh8Xo
66
datasource:
7-
url: jdbc:mysql://120.79.47.111:3307/esc?useUnicode=true&characterEncoding=utf8
8-
username: esc
9-
password: d8Mo9u1JroPbKtXG
7+
url: jdbc:mysql://qa.tidb.hcyy.top:4000/mutants_bi?useUnicode=true&characterEncoding=utf8
8+
username: root
9+
password: 123456

src/main/resources/logback.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@
2828
<logger name="java.sql.Connection" level="ERROR"/>
2929
<logger name="java.sql.Statement" level="ERROR"/>
3030
<logger name="java.sql.PreparedStatement" level="ERROR"/>
31-
<logger name="org.apache" level="DEBUG"/>
31+
<logger name="org.apache" level="ERROR"/>
3232
<logger name="com.cc.etherscan.io" level="DEBUG"/>
3333
</configuration>

src/test/java/com/cc/etherscan/io/ImgTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public void should_1 (){
1313
Noise engine = new Noise();
1414

1515
// 2.active
16-
int retAct=engine.active("");
16+
int retAct=engine.active("a");
1717
if (Const.MACT_SUCCESS == retAct) {
1818
System.out.println(Const.getString(Const.MACT_SUCCESS));
1919
} else if (Const.MACT_TIME_PASSED == retAct) {
@@ -30,7 +30,7 @@ public void should_1 (){
3030

3131
long startTime = System.currentTimeMillis();
3232

33-
int ret = engine.deWatermark("/Users/carlosxiao/Desktop/1.jpg", outFilename,"mask.bmp");
33+
int ret = engine.deWatermark("C:\\Users\\CarlosXiao\\Desktop\\222.jpg", outFilename,"C:\\Users\\CarlosXiao\\Desktop\\mask.bmp");
3434
// int ret = engine.deSpot("circle.jpg", outFilename);
3535
// int ret = engine.deFog("mist.jpg", outFilename);
3636
// int ret = engine.deDark("dark.jpg", outFilename);

src/test/java/com/cc/etherscan/io/JsonTest.java

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22

33
import com.fasterxml.jackson.core.type.TypeReference;
44
import com.fasterxml.jackson.databind.ObjectMapper;
5-
import com.sun.image.codec.jpeg.JPEGCodec;
6-
import com.sun.image.codec.jpeg.JPEGImageDecoder;
75
import org.junit.Test;
8-
import sun.awt.image.PNGImageDecoder;
96

107
import javax.imageio.ImageIO;
118
import java.awt.*;
@@ -64,23 +61,7 @@ public void should_2() throws Exception {
6461
String outputPATH = PATH_DEST + "new/new.jpg";
6562
// 得到文件流
6663
FileInputStream fileInputStream = new FileInputStream(new File(pathSrc));
67-
// 得到输入的编码器,将文件流进行jpg格式编码
68-
JPEGImageDecoder decoder = JPEGCodec.createJPEGDecoder(fileInputStream);
69-
// 得到编码后的图片对象
70-
BufferedImage image = decoder.decodeAsBufferedImage();
71-
Graphics g = image.getGraphics();
72-
InputStream imageSeal = new FileInputStream(new File(sealFile));
73-
// 得到输入的编码器,将文件流进行jpg格式编码
74-
// 得到编码后的图片对象
75-
BufferedImage image2 = ImageIO.read(new File(sealFile));
76-
// 加盖图片章
77-
ImageObserver imageObserver = null;
78-
int x = image.getWidth() - (image2.getWidth() + 400);
79-
int y = image.getHeight() - (image2.getHeight() + 600);
80-
g.drawImage(image2, x, y, imageObserver);
81-
g.dispose();
82-
ImageIO.write(image, "jpeg", new File(outputPATH));
83-
System.out.println("ok");
64+
8465
}
8566

8667
public static void saveFixedBoundIcon(File imageFile, int height, int width) throws Exception {

src/test/java/com/cc/etherscan/io/OcrTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ public class OcrTest {
1212
@Test
1313
public void should_1 () throws TesseractException {
1414
ITesseract instance = new Tesseract();
15-
instance.setDatapath("/Users/carlosxiao/Desktop/");
16-
instance.setLanguage("chi_sim");
17-
String result = instance.doOCR(new File("/Users/carlosxiao/Desktop/2222.jpg"));
15+
instance.setDatapath("D:\\tess4j");
16+
instance.setLanguage("chi_sim+eng");
17+
String result = instance.doOCR(new File("C:\\Users\\CarlosXiao\\Desktop\\222.jpg"));
1818
System.out.println(result);
1919
}
2020
}

0 commit comments

Comments
 (0)