Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
IanCao committed Mar 20, 2020
1 parent 13999b7 commit 6dc027b
Show file tree
Hide file tree
Showing 21 changed files with 609 additions and 1 deletion.
1 change: 1 addition & 0 deletions .idea/.name

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

262 changes: 262 additions & 0 deletions .idea/workspace.xml

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# crawlers
# crawlers

使用 Go 语言学习开发的爬虫
用于学习 Go 语言、Docker/Docker Compose
8 changes: 8 additions & 0 deletions crawler-core/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build image for the crawler-core service.
# NOTE(review): the stage is aliased "AS BUILD" but no second stage follows,
# so this is NOT a multi-stage build — the final image is the full
# golang:1.14 toolchain image. Confirm whether a slim runtime stage was
# intended.
FROM golang:1.14 AS BUILD

# Place and build the sources inside GOPATH.
WORKDIR $GOPATH/src/crawlers/crawler-core
ADD . $GOPATH/src/crawlers/crawler-core

RUN go build crawler.go
# NOTE(review): presumably shrinks the image by dropping fetched sources;
# verify nothing under $GOPATH/src/github.com is needed at runtime.
RUN rm -rf "$GOPATH/src/github.com/"
# Run the crawler binary produced by `go build` above.
CMD ["./crawler"]
5 changes: 5 additions & 0 deletions crawler-core/constant/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package constant

// Base listing URLs; the crawler appends the page number to each.
const (
	Ershoufang_beike = "https://bj.ke.com/ershoufang/pg" // Beike second-hand (ershoufang) listings, Beijing
	Zufang_beike     = "https://bj.zu.ke.com/zufang/pg"  // Beike rental (zufang) listings, Beijing
	Zufang_douban    = ""                                // Douban rentals: crawler not implemented yet
)
47 changes: 47 additions & 0 deletions crawler-core/core/ershoufang_beike.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package core

import (
"crawlers/crawler-core/constant"
"crawlers/crawler-core/dao"
"crawlers/crawler-core/data"
"crawlers/crawler-core/utils"
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"net/http"
"strconv"
)

// ProcessBeiKeErShouFang crawls pages 1-3 of Beike's Beijing second-hand
// housing listings, extracts each listing card, logs it, and persists each
// page's batch to MySQL.
func ProcessBeiKeErShouFang() {
	storage := dao.ConnectMysql()
	// Defer the close so the pool is released on every exit path.
	defer storage.Db.Close()
	for i := 1; i <= 3; i++ {
		url := constant.Ershoufang_beike + strconv.Itoa(i)
		res, err := http.Get(url)
		if err != nil {
			// BUG FIX: the original printed the error and then used the nil
			// response anyway, which would panic. Skip the page instead.
			fmt.Printf("fail %+v", err)
			continue
		}
		if res.StatusCode != http.StatusOK {
			// One bad page should not kill the whole crawler (the old
			// log.Fatal did, and also leaked the response body).
			log.Printf("unexpected status %d for %s", res.StatusCode, url)
			res.Body.Close()
			continue
		}
		doc, err := goquery.NewDocumentFromReader(res.Body)
		// Close the body so the transport can reuse the connection.
		res.Body.Close()
		if err != nil {
			// BUG FIX: this parse error was silently ignored before.
			log.Printf("parse %s: %+v", url, err)
			continue
		}
		var datas = make([]*data.ErShouFangInfo, 0)
		doc.Find(".sellListPage .content .leftContent").Find("div[data-component=list]").Find(".sellListContent .clear").Each(func(i int, selection *goquery.Selection) {
			clear := selection.Find("div[class=info\\ clear]")
			if clear.Nodes == nil {
				// Not a listing card (e.g. an ad row); skip it.
				return
			}
			title := utils.ReplaceAllEmpty(clear.Find(".title").Find("a[class=VIEWDATA\\ CLICKDATA\\ maidian-detail]").Text())
			link, _ := clear.Find(".title").Find("a[class=VIEWDATA\\ CLICKDATA\\ maidian-detail]").Attr("href")
			addressWithPrice := clear.Find(".address")
			address := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=flood]").Find("a").Text())
			houseInfo := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=houseInfo]").Text())
			totalPrice := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=priceInfo]").Find("div[class=totalPrice]").Text())
			unitPrice := utils.ReplaceAllEmpty(addressWithPrice.Find("div[class=priceInfo]").Find("div[class=unitPrice]").Text())
			info := &data.ErShouFangInfo{Title: title, Address: address, HouseInfo: houseInfo, TotalPrice: totalPrice, UnitPrice: unitPrice, Link: link}
			log.Printf("%+v", info)
			datas = append(datas, info)
		})
		storage.AddErShouFangRecords(datas)
	}
}
42 changes: 42 additions & 0 deletions crawler-core/core/zufang_beike.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package core

import (
"crawlers/crawler-core/constant"
"crawlers/crawler-core/dao"
"crawlers/crawler-core/data"
"crawlers/crawler-core/utils"
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"net/http"
"strconv"
"strings"
)

// ProcessBeiKeZuFang crawls pages 1-5 of Beike's Beijing rental listings,
// extracts each listing item, logs it, and persists each page's batch to
// MySQL.
func ProcessBeiKeZuFang() {
	storage := dao.ConnectMysql()
	// Defer the close so the pool is released on every exit path.
	defer storage.Db.Close()
	for i := 1; i <= 5; i++ {
		url := constant.Zufang_beike + strconv.Itoa(i)
		res, err := http.Get(url)
		if err != nil {
			// BUG FIX: the original printed the error and then used the nil
			// response anyway, which would panic. Skip the page instead.
			fmt.Printf("fail %+v", err)
			continue
		}
		if res.StatusCode != http.StatusOK {
			// One bad page should not kill the whole crawler (the old
			// log.Fatal did, and also leaked the response body).
			log.Printf("unexpected status %d for %s", res.StatusCode, url)
			res.Body.Close()
			continue
		}
		doc, err := goquery.NewDocumentFromReader(res.Body)
		// Close the body so the transport can reuse the connection.
		res.Body.Close()
		if err != nil {
			// BUG FIX: this parse error was silently ignored before.
			log.Printf("parse %s: %+v", url, err)
			continue
		}
		var datas = make([]*data.ZuFangInfo, 0)
		doc.Find(".wrapper").Find("#content").Find(".content__list .content__list--item").Each(func(i int, selection *goquery.Selection) {
			title := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main").Find("p[class=content__list--item--title\\ twoline]").Text())
			houseInfo := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main .content__list--item--des").Text())
			// The description is "district-area-..." separated by dashes;
			// the first segment is the address.
			address := strings.Split(houseInfo, "-")[0]
			link, _ := selection.Find(".content__list--item--main").Find("p[class=content__list--item--title\\ twoline]").Find("a").Attr("href")
			price := utils.ReplaceAllEmpty(selection.Find(".content__list--item--main .content__list--item-price").Text())
			info := &data.ZuFangInfo{Title: title, Address: address, HouseInfo: houseInfo, Price: price, Link: "https://bj.zu.ke.com" + link}
			log.Printf("%+v", info)
			datas = append(datas, info)
		})
		storage.AddZuFangRecords(datas)
	}
}
1 change: 1 addition & 0 deletions crawler-core/core/zufang_douban.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package core
21 changes: 21 additions & 0 deletions crawler-core/crawler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package main

import (
"crawlers/crawler-core/core"
"sync"
)

// main runs the ershoufang and zufang crawlers concurrently and blocks
// until both have finished.
func main() {
	maxConcurrentCount := 2
	wg := sync.WaitGroup{}
	wg.Add(maxConcurrentCount)
	go func() {
		defer wg.Done()
		core.ProcessBeiKeErShouFang()
	}()
	go func() {
		defer wg.Done()
		// BUG FIX: the original launched this with an extra `go`, so Done()
		// fired immediately and wg.Wait() could return (and main exit)
		// before the rental crawl had even started.
		core.ProcessBeiKeZuFang()
	}()
	wg.Wait()
}
52 changes: 52 additions & 0 deletions crawler-core/dao/Storage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package dao

import (
"crawlers/crawler-core/data"
_ "github.com/go-sql-driver/mysql"
"github.com/jmoiron/sqlx"
"log"
"os"
)

// Storage wraps the shared sqlx database handle used by the DAO helpers
// below to insert crawled listings.
type Storage struct {
	Db *sqlx.DB // open connection pool to the `house` MySQL database
}

// ConnectMysql opens a sqlx handle to the `house` database using the
// `account`, `passwd`, and `url` environment variables and verifies the
// connection. It exits the process via log.Fatal on failure.
func ConnectMysql() *Storage {
	account := os.Getenv("account")
	passwd := os.Getenv("passwd")
	url := os.Getenv("url")
	db, err := sqlx.Open("mysql", account+":"+passwd+"@tcp("+url+")/house?charset=utf8")
	if err != nil {
		log.Fatal(err)
	}
	// sqlx.Open does not dial the server; Ping makes bad credentials or an
	// unreachable host fail fast here instead of at the first query.
	if err := db.Ping(); err != nil {
		log.Fatal(err)
	}
	return &Storage{Db: db}
}

// AddErShouFangRecord inserts one second-hand listing row into `ershoufang`.
// A failed insert is logged and skipped: the old log.Fatal aborted the whole
// crawler (without closing the pool) because of a single bad row.
// NOTE(review): data.Link is never persisted here — confirm whether the
// table is meant to store it.
func (storage *Storage) AddErShouFangRecord(data *data.ErShouFangInfo) {
	_, err := storage.Db.Exec("insert into ershoufang(`title`,`address`,`houseInfo`,`totalPrice`,`unitPrice`) values(?,?,?,?,?)", data.Title, data.Address, data.HouseInfo, data.TotalPrice, data.UnitPrice)
	if err != nil {
		log.Printf("insert ershoufang record: %v", err)
	}
}

// AddErShouFangRecords persists every second-hand listing in datas, one
// row at a time, via AddErShouFangRecord.
func (storage *Storage) AddErShouFangRecords(datas []*data.ErShouFangInfo) {
	for _, record := range datas {
		storage.AddErShouFangRecord(record)
	}
}

// AddZuFangRecord inserts one rental listing row into `zufang`.
// A failed insert is logged and skipped: the old log.Fatal aborted the whole
// crawler (without closing the pool) because of a single bad row.
func (storage *Storage) AddZuFangRecord(data *data.ZuFangInfo) {
	_, err := storage.Db.Exec("insert into zufang(`title`,`address`,`houseInfo`,`price`,`link`) values(?,?,?,?,?)", data.Title, data.Address, data.HouseInfo, data.Price, data.Link)
	if err != nil {
		log.Printf("insert zufang record: %v", err)
	}
}

// AddZuFangRecords persists every rental listing in datas, one row at a
// time, via AddZuFangRecord.
func (storage *Storage) AddZuFangRecords(datas []*data.ZuFangInfo) {
	for _, record := range datas {
		storage.AddZuFangRecord(record)
	}
}
18 changes: 18 additions & 0 deletions crawler-core/data/HouseInfo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package data

// ErShouFangInfo is one second-hand (ershoufang) listing scraped from a
// Beike listing card. All fields hold the raw display text after
// whitespace stripping.
type ErShouFangInfo struct {
	Title      string // listing title
	Address    string // community/location name
	HouseInfo  string // raw layout/size description
	TotalPrice string // total price as displayed (unparsed text)
	UnitPrice  string // per-unit price as displayed (unparsed text)
	Link       string // detail-page URL
}

// ZuFangInfo is one rental (zufang) listing scraped from a Beike listing
// item. All fields hold the raw display text after whitespace stripping.
type ZuFangInfo struct {
	Title     string // listing title
	Address   string // first dash-separated segment of HouseInfo
	HouseInfo string // raw description ("district-area-..." text)
	Price     string // price as displayed (unparsed text)
	Link      string // absolute detail-page URL
}
9 changes: 9 additions & 0 deletions crawler-core/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module crawlers/crawler-core

go 1.14

require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/go-sql-driver/mysql v1.5.0
github.com/jmoiron/sqlx v1.2.0
)
17 changes: 17 additions & 0 deletions crawler-core/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs=
github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/jmoiron/sqlx v1.2.0 h1:41Ip0zITnmWNR/vHV+S4m+VoUivnWY5E4OJfLZjCJMA=
github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
7 changes: 7 additions & 0 deletions crawler-core/utils/StringUtils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package utils

import "strings"

// ReplaceAllEmpty strips every newline and space character from s and
// returns the result.
func ReplaceAllEmpty(s string) string {
	replacer := strings.NewReplacer("\n", "", " ", "")
	return replacer.Replace(s)
}
28 changes: 28 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
version: '3'
services:
  crawlers:
    image: crawler:0.0.1
    depends_on:
      - database
    # Wait for MySQL to come up, then start the crawler.
    # BUG FIX: the original `entrypoint: "sleep 30s"` replaced the image's
    # entrypoint entirely, so the crawler binary never ran.
    entrypoint: ["sh", "-c", "sleep 30 && ./crawler"]
    environment:
      account: root
      passwd: 123456
      # BUG FIX: container-to-container traffic uses the internal port 3306;
      # 3307 is only the host-side mapping of the `ports` entry below.
      url: "database:3306"
    networks:
      - crawlers_net
  database:
    image: crawler-mysql:0.0.1
    ports:
      - "3307:3306"
    networks:
      - crawlers_net
    environment:
      # NOTE(review): the custom image runs /mysql/setup.sh as CMD, so these
      # official-image variables may be ignored — confirm they are needed.
      MYSQL_ROOT_PASSWORD: "123456"
      MYSQL_USER: 'root'
      MYSQL_PASS: '123456'
    restart: on-failure
networks:
  crawlers_net:
    external:
      name: crawlers_back
9 changes: 9 additions & 0 deletions mysql/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
FROM mysql:5.7.26

ENV MYSQL_ALLOW_EMPTY_PASSWORD yes

COPY setup.sh /mysql/setup.sh
COPY init.sql /mysql/init.sql
COPY privileges.sql /mysql/privileges.sql

CMD ["sh", "/mysql/setup.sh"]
27 changes: 27 additions & 0 deletions mysql/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Bootstrap schema for the crawler: creates the `house` database and the
-- two listing tables the Go DAO inserts into.
SET NAMES 'utf8';
create database house character set utf8;
use house;

-- Start from a clean slate on re-initialization.
drop table if exists `ershoufang`;
drop table if exists `zufang`;

-- Second-hand listings (filled by AddErShouFangRecord).
-- NOTE(review): `followInfo` is never populated by the Go DAO, and there is
-- no `link` column even though the Go struct carries one — confirm intent.
CREATE TABLE `ershoufang` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`address` varchar(255) DEFAULT NULL,
`houseInfo` varchar(255) DEFAULT NULL,
`followInfo` varchar(255) DEFAULT NULL,
`totalPrice` varchar(255) DEFAULT NULL,
`unitPrice` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Rental listings (filled by AddZuFangRecord).
CREATE TABLE `zufang` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`address` varchar(255) DEFAULT NULL,
`houseInfo` varchar(255) DEFAULT NULL,
`price` varchar(255) DEFAULT NULL,
`link` varchar(2048) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
7 changes: 7 additions & 0 deletions mysql/privileges.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Grants the root account remote access to the `house` database.
use mysql;
select host, user from user;
-- Because the MySQL version is 5.7, a new user would be created with:
-- create user root identified by '123456';
grant all on house.* to root@'%' identified by '123456' with grant option;
-- This statement is required for the grant to take effect:
flush privileges;
30 changes: 30 additions & 0 deletions mysql/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Container entrypoint: starts mysqld, loads the schema and privileges,
# then blocks forever so the container stays up.
set -e

# Show the MySQL service status for debugging; this line can be removed.
echo `service mysql status`

echo '1.启动mysql....'
# Start MySQL.
service mysql start
sleep 3
echo `service mysql status`

echo '2.开始导入数据....'
# Import the schema/data.
mysql < /mysql/init.sql
echo '3.导入数据完毕....'

sleep 3
echo `service mysql status`

# Reset the MySQL password / grants.
echo '4.开始修改密码....'
mysql < /mysql/privileges.sql
echo '5.修改密码完毕....'

#sleep 3
echo `service mysql status`
echo 'mysql容器启动完毕,且数据导入成功'

# Keep the foreground process alive so the container does not exit.
tail -f /dev/null

0 comments on commit 6dc027b

Please sign in to comment.