1
1
package com .cc .etherscan .io .processor ;
2
2
3
3
import com .cc .etherscan .io .common .Constants ;
4
+ import com .cc .etherscan .io .entity .YfwProductInfo ;
5
+ import com .cc .etherscan .io .mapper .YfwProductInfoDao ;
6
+ import com .cc .etherscan .io .util .Tess4jUtils ;
4
7
import lombok .extern .slf4j .Slf4j ;
8
+ import net .sourceforge .tess4j .TesseractException ;
5
9
import org .apache .commons .lang .StringUtils ;
6
- import org .jsoup .select .Elements ;
7
10
import us .codecraft .webmagic .Page ;
8
11
import us .codecraft .webmagic .Site ;
9
12
import us .codecraft .webmagic .processor .PageProcessor ;
10
- import us .codecraft .webmagic .selector .Html ;
11
13
14
+ import java .io .IOException ;
12
15
import java .util .List ;
13
16
14
17
import static com .cc .etherscan .io .common .Constants .SUB_CAT_MED_DETAILS ;
@@ -24,84 +27,124 @@ public class PharmacyProcessor implements PageProcessor {
24
27
25
28
private Site site ;
26
29
30
+ public PharmacyProcessor (YfwProductInfoDao yfwProductInfoDao ) {
31
+ this .yfwProductInfoDao = yfwProductInfoDao ;
32
+ }
33
+
34
+ private YfwProductInfoDao yfwProductInfoDao ;
35
+
27
36
@ Override
28
37
public void process (Page page ) {
29
38
// subcat
30
39
String requestUrl = page .getUrl ().get ();
31
40
List <String > catList = page .getHtml ().xpath ("//div[@class='subcat']" ).links ().all ();
32
- if (catList .size () > 0 ) {
33
- log .info ("分类地址数量: {}" , catList .size ());
34
- }
35
41
if (page .getUrl ().regex (Constants .SUB_CAT ).match ()) {
36
42
// 获取总页数,得到当前适应症下的所有二级页面
37
43
//
38
44
String catId = requestUrl .substring (requestUrl .indexOf ("-" ) + 1 , requestUrl .indexOf (".html" ));
39
- String s = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[3]/div[3]/span" ).get ();
45
+ String s = page .getHtml ().xpath ("//// *[@id=\" wrap\" ]/div[2]/div[3]/div[3]/span/text() " ).get ();
40
46
String [] split = StringUtils .split (s , "/" );
41
47
if (split .length > 1 ) {
42
48
int totalPage = Integer .parseInt (StringUtils .trim (split [1 ]));
49
+ log .info ("【SUB_CAT】页面: {}, 子页面数:{}" , page .getUrl ().get (), totalPage );
43
50
for (int i = 1 ; i <= totalPage ; i ++) {
44
51
String catItemUrl = String .format (Constants .SUB_CAT_FG , catId , i );
45
52
page .addTargetRequest (catItemUrl );
46
53
}
47
54
}
48
55
} else if (page .getUrl ().regex (SUB_CAT_SP ).match ()) {
49
- List <String > all = page .getHtml ().xpath ("//div[@class=\" goodlist\" ]/a[@class=\" photo\" ]" ).links ().all ();
56
+ List <String > all = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/ul/li/div/a[@class=\" photo\" ]" ).links ().all ();
57
+ log .info ("【SUB_CAT_SP】页面: {}, 子页面数:{}" , page .getUrl ().get (), all .size ());
50
58
page .addTargetRequests (all );
51
59
} else if (page .getUrl ().regex (SUB_CAT_MED_DETAILS ).match ()) {
60
+ log .info ("【SUB_CAT_MED_DETAILS】页面: {}" , page .getUrl ().get ());
61
+ YfwProductInfo item = new YfwProductInfo ();
62
+ String productId = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[1]/div[2]/div/text()" ).get ();
63
+ productId = productId .substring (productId .indexOf (":" ) + 1 , productId .length () - 1 );
64
+ item .setProductId (productId );
52
65
// 商品名
53
66
String productName = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/h1/strong/span/text()" ).get ();
67
+ item .setProductName (productName );
54
68
// 通用名
55
69
String commodityName = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[1]/strong/text()" ).get ();
70
+ item .setCommodityName (commodityName );
56
71
// 商标
57
72
String trademark = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[2]/text()" ).get ();
73
+ item .setTradeMark (trademark );
58
74
// 挤型
59
75
String jixing = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[4]/text()" ).get ();
76
+ item .setDosage (jixing );
60
77
// 规格
61
78
String spec = page .getHtml ().xpath ("//*[@id=\" standardOther\" ]/div[1]/text()" ).get ();
79
+ item .setSpec (spec );
62
80
// 生产厂家
63
81
String cj = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[5]/text()" ).get ();
82
+ item .setFactory (cj );
64
83
// 有效期
65
84
String ex = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[6]/label/text()" ).get ();
85
+ item .setExpired (ex );
86
+
87
+ String pzwh = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[7]/div/img/@src" ).get ();
88
+ String approvalNumber = "" ;
89
+ try {
90
+ if (StringUtils .isNotEmpty (pzwh ) && !StringUtils .equalsIgnoreCase (pzwh , "null" )) {
91
+ approvalNumber = Tess4jUtils .getApprovalNumber ("https:" + pzwh );
92
+ item .setApprovalNumber (approvalNumber );
93
+ }
94
+ } catch (TesseractException e ) {
95
+ e .printStackTrace ();
96
+ } catch (Exception e ) {
97
+ e .printStackTrace ();
98
+ }
99
+ // 浏览次数
100
+ String llcs = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[9]/text()" ).get ();
101
+ item .setViewCount (llcs );
66
102
// 适应症
67
103
String syz = page .getHtml ().xpath ("//*[@id=\" wrap\" ]/div[2]/div[2]/div/dl[1]/dd[8]/strong/text()" ).get ();
104
+ item .setIndication (syz );
68
105
// 说明书_标题
69
106
String sms_title = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/h2/text()" ).get ();
107
+ item .setSmsTitle (sms_title );
70
108
String sms_tips = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/div[1]/text()" ).get ();
71
-
109
+ item . setSmsTips ( sms_tips );
72
110
// 药品名称
73
111
String sms_name_cname = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/div[3]/dl/dd[1]/text()" ).get ();
112
+ item .setSmsCname (sms_name_cname );
74
113
String sms_name_ename = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/div[3]/dl/dd[3]/text()" ).get ();
114
+ item .setSmsEname (sms_name_ename );
75
115
String sms_name_py = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/div[3]/dl/dd[4]/text()" ).get ();
116
+ item .setSmsPy (sms_name_py );
76
117
// 执行标准
77
118
String sms_zxbz = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[1]/p/text()" ).get ();
119
+ item .setSmsZxbz (sms_zxbz );
78
120
// 性状
79
121
String sms_xz = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[2]/p/text()" ).get ();
122
+ item .setSmsXz (sms_xz );
80
123
// 组方/成份
81
124
String sms_cf = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[3]/p/text()" ).get ();
125
+ item .setSmsCf (sms_cf );
82
126
// 功能与主治
83
127
String sms_gnzz = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[4]/p/text()" ).get ();
128
+ item .setSmsGnzz (sms_gnzz );
84
129
// 用法用量
85
130
String sms_yfyl = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[5]/p/text()" ).get ();
131
+ item .setSmsYfyl (sms_yfyl );
86
132
// 不良反应
87
133
String sms_blfy = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[6]/p/text()" ).get ();
134
+ item .setSmsBlfy (sms_blfy );
88
135
// 禁忌症
89
136
String sms_jjz = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[7]/p/text()" ).get ();
137
+ item .setSmsJjz (sms_jjz );
90
138
// 药物相互作用
91
139
String sms_ywzy = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[8]/p/text()" ).get ();
140
+ item .setSmsYwzy (sms_ywzy );
92
141
// 贮藏
93
142
String sms_zc = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[9]/p/text()" ).get ();
94
- String productImgText = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[10]/textarea/text()" ).get ();
95
- Html html = new Html (productImgText );
96
- Page page1 = new Page ();
97
- page1 .setHtml (html );
98
- List <String > all = page1 .getHtml ().xpath ("//*a/@href" ).all ();
99
- String sms_img = page .getHtml ().xpath ("//*[@id=\" guide\" ]/div[1]/dl/dd[11]/textarea/text()" ).get ();
100
- Html sms_img_html = new Html (sms_img );
101
- Elements a = sms_img_html .getDocument ().getElementsByTag ("a" );
102
- String href = a .get (0 ).attr ("href" );
103
- System .out .println (sms_blfy );
143
+ item .setSmsZc (sms_zc );
144
+ int insert = yfwProductInfoDao .insert (item );
145
+ log .info ("【SUB_CAT_MED_DETAILS】页面: {}, 保存结果:{}" , page .getUrl ().get (), insert );
104
146
} else {
147
+ log .info ("【INDEX】页面: {}, 子页面数:{}" , page .getUrl ().get (), catList .size ());
105
148
page .addTargetRequests (catList );
106
149
}
107
150
}
0 commit comments