Skip to content

Commit 07466e1

Browse files
committed
🎨 优化多线程下载
优化输出、下载与兼容性。多线程下载在成功下载一定数量的图片之后会停止运行,原因暂时不明。
1 parent a6d4eb7 commit 07466e1

File tree

3 files changed

+32
-27
lines changed

3 files changed

+32
-27
lines changed

Config.php

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@
3939
define("SPIDERWAIT_TIME_MIN", "1");//最小等待
4040
define('SPIDERWAIT_TIME_MAX', "10");//最大等待
4141
//artstation执行一次循环,睡一会觉觉(误)
42-
define("ARTSTATION_SLEEP", "true");
43-
define("ARTSTATION_SLEEP_TIME_MIN", "10");//最小等待
44-
define("ARTSTATION_SLEEP_TIME_MAX", "20");//最大等待
42+
define("ARTSTATION_SLEEP", false);
43+
define("ARTSTATION_SLEEP_TIME_MIN", "1");//最小等待
44+
define("ARTSTATION_SLEEP_TIME_MAX", "10");//最大等待
4545
//bilibili执行一次循环, 就-1s
46-
define("BILIBILI_SLEEP", "true");
47-
define("BILIBILI_SLEEP_TIME_MIN", "10");//最小等待
48-
define("BILIBILI_SLEEP_TIME_MAX", "40");//最大等待
46+
define("BILIBILI_SLEEP", false);
47+
define("BILIBILI_SLEEP_TIME_MIN", "1");//最小等待
48+
define("BILIBILI_SLEEP_TIME_MAX", "10");//最大等待
4949

5050
//数据库链接
5151
define("SAVE_IMAGES_URL_DATABASE", "false");//是否记录图片链接到数据库
@@ -57,5 +57,5 @@
5757

5858
define("SPIDER_LOG", true);//是否使用爬取记录
5959

60-
define('CURL_DOWN_OPT',false);//是否启用多线程下载
61-
define("CURL_DOWN_NUM",8);//最多多少个同时下载
60+
define('CURL_DOWN_OPT', false);//是否启用多线程下载
61+
define("CURL_DOWN_NUM", 8);//最多多少个同时下载

Src/PublicCore.php

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -231,54 +231,58 @@ public function spider_wait($status = true, $min = SPIDERWAIT_TIME_MIN, $max = S
231231
sleep($num);
232232
}
233233
}
234-
234+
235235
/**
236236
* 一个下载调用其他函数的封装
237237
* @param $string
238238
* @param $images_arr
239239
*/
240240
public function quick_down_img($string, $images_arr, $spider_name, $filename_data = null)
241241
{
242-
if (CURL_DOWN_OPT) { //选择下载模式
243-
$this->curl_multi_down_images($string, $images_arr, $spider_name, $filename_data);
242+
$dir_path = $this->new_dir_name($string);//生成保存路径
243+
if (CURL_DOWN_OPT) { //下载图片
244+
$this->curl_multi_down_images($images_arr, $dir_path, $spider_name, $filename_data);
244245
} else {
245-
$dir_path = $this->new_dir_name($string);//生成保存路径
246-
$this->image_save($images_arr, $dir_path, $spider_name, $filename_data);//下载图片
247-
print_r("文件夹现在有:" . $this->images_number($dir_path) . "张图片");
246+
$this->image_save($images_arr, $dir_path, $spider_name, $filename_data);
248247
}
248+
print_r("文件夹现在有:" . $this->images_number($dir_path) . "张图片");
249249
}
250250

251251
/**
252-
* CURL多线程下载 
252+
* 多线程图片下载
253+
* @param $images_arr
254+
* @param $dir_path
255+
* @param $spider_name
256+
* @param null $filename_data
253257
*/
254-
public function curl_multi_down_images($string, $images_arr, $spider_name, $filename_data = null)
258+
public function curl_multi_down_images($images_arr, $dir_path, $spider_name, $filename_data = null)
255259
{
256-
$dir_path = $this->new_dir_name($string);//生成保存路径
257260
for ($c = 1; $c <= count($images_arr); $c += CURL_DOWN_NUM) {//开始循环
258-
259261
$mh = curl_multi_init();//初始化 
260262
$arr = array_slice($images_arr, $c - 1, CURL_DOWN_NUM);
261263
$conn = [];
262264

263-
foreach ($arr as $title => $url) {
265+
foreach ($arr as $item) {
266+
$title = array_keys($item)[0];
264267
if (file_exists($dir_path . DIRECTORY_SEPARATOR . $title)) {//检测是否存在
265268
echo "已存在" . PHP_EOL;
266269
continue;
267270
}
268271
$conn[$title] = curl_init();
269-
curl_setopt($conn[$title], CURLOPT_URL, $url);
272+
curl_setopt($conn[$title], CURLOPT_URL, $item[$title]);
273+
curl_setopt($conn[$title], CURLOPT_TIMEOUT, 30);
270274
curl_setopt($conn[$title], CURLOPT_RETURNTRANSFER, 1);
271275
curl_multi_add_handle($mh, $conn[$title]);
276+
echo $title . PHP_EOL;
272277
}
278+
273279
// 执行批处理句柄
274280
$active = null;
275281
do {
276282
curl_multi_exec($mh, $active); //执行批处理句柄
277283
} while ($active > 0); //4
278-
279284
foreach ($conn as $title => $url) {
280285
$res[$title] = curl_multi_getcontent($conn[$title]);
281-
curl_close($conn['title']);
282286
curl_multi_remove_handle($mh, $conn[$title]);//释放资源
283287
}
284288

@@ -288,7 +292,6 @@ public function curl_multi_down_images($string, $images_arr, $spider_name, $file
288292
file_put_contents($dir_path . DIRECTORY_SEPARATOR . $title, $item);
289293
$this->add_log($spider_name, $title . PHP_EOL, $filename_data);
290294
}
291-
print_r("文件夹现在有:" . $this->images_number($dir_path) . "张图片" . PHP_EOL);
292295
}
293296
}
294297

@@ -323,4 +326,4 @@ public function bMenu($string, $spiderName)
323326
}
324327

325328
public $splitLine = "<<<<<<<<<<<<<<<<<<================================>>>>>>>>>>>>>>>>>>";
326-
}
329+
}

Src/Spider/Artstation.php

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public function get_img_url($result, $spiderCore)
4242
// $images_url = str_replace("medium", "large", $images_url);//生成更高清的图片下载地址
4343
// $images_url=str_replace('smaller_square','large',$images_url);
4444
// preg_replace("/201.[0-9]{3,}/",'',$images_url);
45-
$rep = substr_replace($images_url,'large',63,29);
45+
$images_url = substr_replace($images_url,'large',63,29);
4646
$file_name = $value->title . "-" . $value->cover_asset_id;//生成图片名
4747
$file_name = $spiderCore->image_url_format($images_url, $file_name);
4848
array_push($images_arr, [$file_name => $images_url]);
@@ -62,12 +62,14 @@ public function index_spider_core($spiderCore, $spider_name, $parm)
6262
{
6363
$posts_num = $spiderCore->user_input("请输入爬取页数(1页=50个作品)(默认为:1):", 1);
6464
for ($start_num = 1; $start_num <= $posts_num; $start_num++) {
65-
$url = "https://www.artstation.com/projects.json?page=" . $start_num . $parm;
65+
$url = "https://www.artstation.com/projects.json?medium=digital2d&page=" . $start_num . $parm;
6666
$result = $spiderCore->curl_get($url, $this->userAgent);
6767
$result = json_decode($result);
6868
$images_arr = $this->get_img_url($result, $spiderCore);
69-
69+
// var_dump($images_arr);
70+
// die();
7071
$spiderCore->quick_down_img($this->spider_name . "-" . $spider_name, $images_arr, "Artstation");
72+
// $spiderCore->curl_multi_down_images($spider_name, $images_arr, "Artstation");
7173
$spiderCore->spider_wait(ARTSTATION_SLEEP, ARTSTATION_SLEEP_TIME_MIN, ARTSTATION_SLEEP_TIME_MAX);
7274
}
7375

0 commit comments

Comments
 (0)