-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
609 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,4 @@ | ||
# crawlers | ||
# crawlers | ||
|
||
使用go语言 学习开发的爬虫 | ||
用于学习go语言,docker/dockerCompose |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Image for the crawler: copies the sources into the GOPATH and compiles them in place.
FROM golang:1.14 AS BUILD

# Build inside $GOPATH/src/crawlers/crawler-core, the same path used as the module name.
WORKDIR $GOPATH/src/crawlers/crawler-core
ADD . $GOPATH/src/crawlers/crawler-core

# Compile crawler.go; the resulting ./crawler binary is written to the working directory.
RUN go build crawler.go
RUN rm -rf "$GOPATH/src/github.com/"
# Default command: run the compiled crawler.
CMD ["./crawler"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package constant | ||
|
||
// Base URLs of the listing pages to crawl. The crawlers append the page
// number directly to the URL.
const (
	// Ershoufang_beike is the Beike (bj.ke.com) second-hand housing list URL prefix.
	Ershoufang_beike = "https://bj.ke.com/ershoufang/pg"
	// Zufang_beike is the Beike (bj.zu.ke.com) rental list URL prefix.
	Zufang_beike = "https://bj.zu.ke.com/zufang/pg"
	// Zufang_douban is currently empty.
	Zufang_douban = ""
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package core | ||
|
||
import ( | ||
"crawlers/crawler-core/constant" | ||
"crawlers/crawler-core/dao" | ||
"crawlers/crawler-core/data" | ||
"crawlers/crawler-core/utils" | ||
"fmt" | ||
"github.com/PuerkitoBio/goquery" | ||
"log" | ||
"net/http" | ||
"strconv" | ||
) | ||
|
||
func ProcessBeiKeErShouFang() { | ||
storage := dao.ConnectMysql() | ||
for i := 1; i <= 3; i++ { | ||
url := constant.Ershoufang_beike + strconv.Itoa(i) | ||
res, err := http.Get(url) | ||
if err != nil { | ||
fmt.Printf("fail %+v", err) | ||
} | ||
if res.StatusCode != http.StatusOK { | ||
log.Fatal(res.StatusCode) | ||
} | ||
doc, err := goquery.NewDocumentFromReader(res.Body) | ||
var datas = make([]*data.ErShouFangInfo, 0) | ||
doc.Find(".sellListPage .content .leftContent").Find("div[data-component=list]").Find(".sellListContent .clear").Each(func(i int, selection *goquery.Selection) { | ||
clear := selection.Find("div[class=info\\ clear]") | ||
if clear.Nodes == nil { | ||
return | ||
} | ||
title := utils.ReplaceAllEmpty(clear.Find(".title").Find("a[class=VIEWDATA\\ CLICKDATA\\ maidian-detail]").Text()) | ||
link, _ := clear.Find(".title").Find("a[class=VIEWDATA\\ CLICKDATA\\ maidian-detail]").Attr("href") | ||
addressWithPrice := clear.Find(".address") | ||
address := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=flood]").Find("a").Text()) | ||
houseInfo := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=houseInfo]").Text()) | ||
totalPrice := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=priceInfo]").Find("div[class=totalPrice]").Text()) | ||
unitPrice := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=priceInfo]").Find("div[class=unitPrice]").Text()) | ||
info := &data.ErShouFangInfo{Title: title, Address: address, HouseInfo: houseInfo, TotalPrice: totalPrice, UnitPrice: unitPrice, Link: link} | ||
log.Printf("%+v", info) | ||
datas = append(datas, info) | ||
}) | ||
storage.AddErShouFangRecords(datas) | ||
} | ||
storage.Db.Close() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package core | ||
|
||
import ( | ||
"crawlers/crawler-core/constant" | ||
"crawlers/crawler-core/dao" | ||
"crawlers/crawler-core/data" | ||
"crawlers/crawler-core/utils" | ||
"fmt" | ||
"github.com/PuerkitoBio/goquery" | ||
"log" | ||
"net/http" | ||
"strconv" | ||
"strings" | ||
) | ||
|
||
func ProcessBeiKeZuFang() { | ||
storage := dao.ConnectMysql() | ||
for i := 1; i <= 5; i++ { | ||
url := constant.Zufang_beike + strconv.Itoa(i) | ||
res, err := http.Get(url) | ||
if err != nil { | ||
fmt.Printf("fail %+v", err) | ||
} | ||
if res.StatusCode != http.StatusOK { | ||
log.Fatal(res.StatusCode) | ||
} | ||
doc, err := goquery.NewDocumentFromReader(res.Body) | ||
var datas = make([]*data.ZuFangInfo, 0) | ||
doc.Find(".wrapper").Find("#content").Find(".content__list .content__list--item").Each(func(i int, selection *goquery.Selection) { | ||
title := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main").Find("p[class=content__list--item--title\\ twoline]").Text()) | ||
houseInfo := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main .content__list--item--des").Text()) | ||
address := strings.Split(houseInfo, "-")[0] | ||
link, _ := selection.Find(".content__list--item--main").Find("p[class=content__list--item--title\\ twoline]").Find("a").Attr("href") | ||
price := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main .content__list--item-price").Text()) | ||
info := &data.ZuFangInfo{Title: title, Address: address, HouseInfo: houseInfo, Price: price, Link: "https://bj.zu.ke.com" + link} | ||
log.Printf("%+v", info) | ||
datas = append(datas, info) | ||
}) | ||
storage.AddZuFangRecords(datas) | ||
} | ||
storage.Db.Close() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
package core |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package main | ||
|
||
import ( | ||
"crawlers/crawler-core/core" | ||
"sync" | ||
) | ||
|
||
func main() { | ||
maxConcurrentCount := 2 | ||
wg := sync.WaitGroup{} | ||
wg.Add(maxConcurrentCount) | ||
go func() { | ||
defer wg.Done() | ||
core.ProcessBeiKeErShouFang() | ||
}() | ||
go func() { | ||
defer wg.Done() | ||
go core.ProcessBeiKeZuFang() | ||
}() | ||
wg.Wait() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package dao | ||
|
||
import ( | ||
"crawlers/crawler-core/data" | ||
_ "github.com/go-sql-driver/mysql" | ||
"github.com/jmoiron/sqlx" | ||
"log" | ||
"os" | ||
) | ||
|
||
// Storage wraps the sqlx database handle used to persist crawled listings.
type Storage struct {
	// Db is the handle opened by ConnectMysql; callers close it when done.
	Db *sqlx.DB
}
|
||
func ConnectMysql() *Storage { | ||
account := os.Getenv("account") | ||
passwd := os.Getenv("passwd") | ||
url := os.Getenv("url") | ||
db, err := sqlx.Open("mysql", account+":"+passwd+"@tcp("+url+")/house?charset=utf8") | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
return &Storage{Db: db} | ||
} | ||
|
||
func (storage *Storage) AddErShouFangRecord(data *data.ErShouFangInfo) { | ||
_, err := storage.Db.Exec("insert into ershoufang(`title`,`address`,`houseInfo`,`totalPrice`,`unitPrice`) values(?,?,?,?,?)", data.Title, data.Address, data.HouseInfo, data.TotalPrice, data.UnitPrice) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
} | ||
|
||
func (storage *Storage) AddErShouFangRecords(datas []*data.ErShouFangInfo) { | ||
for i := 0; i < len(datas); i++ { | ||
record := datas[i] | ||
storage.AddErShouFangRecord(record) | ||
} | ||
} | ||
|
||
func (storage *Storage) AddZuFangRecord(data *data.ZuFangInfo) { | ||
_, err := storage.Db.Exec("insert into zufang(`title`,`address`,`houseInfo`,`price`,`link`) values(?,?,?,?,?)", data.Title, data.Address, data.HouseInfo, data.Price, data.Link) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
} | ||
|
||
func (storage *Storage) AddZuFangRecords(datas []*data.ZuFangInfo) { | ||
for i := 0; i < len(datas); i++ { | ||
record := datas[i] | ||
storage.AddZuFangRecord(record) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package data | ||
|
||
// ErShouFangInfo holds the text fields of one second-hand housing listing.
// All values are kept as strings.
type ErShouFangInfo struct {
	Title, Address, HouseInfo string
	TotalPrice, UnitPrice     string
	Link                      string
}
|
||
// ZuFangInfo holds the text fields of one rental listing.
// All values are kept as strings.
type ZuFangInfo struct {
	Title, Address, HouseInfo string
	Price                     string
	Link                      string
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
module crawlers/crawler-core | ||
|
||
go 1.14 | ||
|
||
require ( | ||
github.com/PuerkitoBio/goquery v1.5.1 | ||
github.com/go-sql-driver/mysql v1.5.0 | ||
github.com/jmoiron/sqlx v1.2.0 | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= | ||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= | ||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= | ||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= | ||
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= | ||
github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs= | ||
github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= | ||
github.com/jmoiron/sqlx v1.2.0 h1:41Ip0zITnmWNR/vHV+S4m+VoUivnWY5E4OJfLZjCJMA= | ||
github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks= | ||
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= | ||
github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= | ||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= | ||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= | ||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= | ||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= | ||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= | ||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package utils | ||
|
||
import "strings" | ||
|
||
// ReplaceAllEmpty returns s with every newline ("\n") and every space (" ")
// removed. All other bytes, including tabs and carriage returns, are kept.
func ReplaceAllEmpty(s string) string {
	var out strings.Builder
	out.Grow(len(s))
	// Both characters are single ASCII bytes, so filtering byte by byte never
	// splits a multi-byte UTF-8 sequence.
	for i := 0; i < len(s); i++ {
		if c := s[i]; c != '\n' && c != ' ' {
			out.WriteByte(c)
		}
	}
	return out.String()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Compose file for the crawler and its MySQL database.
version: '3'
services:
  crawlers:
    image: crawler:0.0.1
    depends_on:
      - database
    # Wait for the database to finish initialising, then start the crawler.
    # A bare "sleep 30s" entrypoint would replace the image command, so the
    # crawler binary would never run.
    entrypoint: ["sh", "-c", "sleep 30 && ./crawler"]
    environment:
      account: root
      passwd: "123456"
      # Containers on the same network reach MySQL on its container port
      # (3306); 3307 is only the port published on the host.
      url: "database:3306"
    networks:
      - crawlers_net
  database:
    image: crawler-mysql:0.0.1
    ports:
      - "3307:3306"
    networks:
      - crawlers_net
    environment:
      MYSQL_ROOT_PASSWORD: "123456"
      MYSQL_USER: 'root'
      MYSQL_PASS: '123456'
    restart: on-failure
networks:
  crawlers_net:
    external:
      name: crawlers_back
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# MySQL image that initialises the crawler schema and privileges at start-up.
FROM mysql:5.7.26

ENV MYSQL_ALLOW_EMPTY_PASSWORD yes

# Ship the start-up script and the SQL files it feeds to mysql.
COPY setup.sh /mysql/setup.sh
COPY init.sql /mysql/init.sql
COPY privileges.sql /mysql/privileges.sql

# Run the start-up script as the container command.
CMD ["sh", "/mysql/setup.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
-- Creates the `house` database and the two tables the crawler writes to.
SET NAMES 'utf8';
create database house character set utf8;
use house;

-- Start from a clean slate if the tables already exist.
drop table if exists `ershoufang`;
drop table if exists `zufang`;

-- Second-hand housing listings; every scraped value is stored as text.
CREATE TABLE `ershoufang` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`address` varchar(255) DEFAULT NULL,
`houseInfo` varchar(255) DEFAULT NULL,
`followInfo` varchar(255) DEFAULT NULL,
`totalPrice` varchar(255) DEFAULT NULL,
`unitPrice` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Rental listings; `link` is wider than the other columns (up to 2048 characters).
CREATE TABLE `zufang` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`address` varchar(255) DEFAULT NULL,
`houseInfo` varchar(255) DEFAULT NULL,
`price` varchar(255) DEFAULT NULL,
`link` varchar(2048) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
-- Grants root access to the house database from any host and sets its password.
use mysql;
select host, user from user;
-- Because the MySQL version is 5.7, a new user would be created with the following command:
-- create user root identified by '123456';
grant all on house.* to root@'%' identified by '123456' with grant option;
-- This statement is required:
flush privileges;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/bin/bash
# Starts MySQL, loads the schema from init.sql, applies privileges.sql, then
# blocks forever. Any failing command aborts the script (set -e).
set -e

# Print the mysql service status to ease debugging; this statement can be removed.
echo `service mysql status`

echo '1.启动mysql....'
# Start mysql.
service mysql start
sleep 3
echo `service mysql status`

echo '2.开始导入数据....'
# Import the data.
mysql < /mysql/init.sql
echo '3.导入数据完毕....'

sleep 3
echo `service mysql status`

# Reset the mysql password.
echo '4.开始修改密码....'
mysql < /mysql/privileges.sql
echo '5.修改密码完毕....'

#sleep 3
echo `service mysql status`
echo 'mysql容器启动完毕,且数据导入成功'

# Block forever so the script does not exit after the setup steps.
tail -f /dev/null