-
Notifications
You must be signed in to change notification settings - Fork 0
/
collect.php
116 lines (109 loc) · 3.31 KB
/
collect.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
<?php
/**
* 电子书单采集脚本
*
* 源自:http://www.xiaoshuotxt.org/
*
* @author weilong <github.com/wilon>
*/
use DiDom\Document;
// Turn every PHP error the handler receives into an Exception, so the try/catch
// blocks further down can log the problem and carry on instead of aborting the run.
error_reporting(E_ALL ^ E_WARNING ^ E_NOTICE);
set_error_handler(function ($errno, $errstr, $errfile, $errline) {
    // Same single-line layout as before, including the trailing space.
    $message = sprintf(
        'errno: %s errstr: %s errfile: %s errline: %s ',
        $errno,
        $errstr,
        $errfile,
        $errline
    );

    throw new Exception($message);
});
// init
require_once __DIR__ . '/vendor/autoload.php';
// Settings: site root plus the local error-log and result-file locations.
$baseUri = 'http://www.xiaoshuotxt.org/';
$logFile = path(__DIR__, 'error.log');
$resFile = path(__DIR__, 'result.log');
// Lines already written to result.log (used to resume an interrupted run).
// On a first run the file does not exist yet: file() would then raise a warning,
// which the error handler installed earlier in this script turns into an uncaught
// Exception. Start from an empty list in that case.
$fp = is_file($resFile) ? file($resFile) : [];
// Category table: URL slug on the site => category label written to result.log.
// The array order is significant: the numeric position of a slug is what the
// collection loop compares against the resume point.
$classifyAllArr = [
    'mingzhu'  => '文学名著', // literary classics
    'dangdai'  => '现代小说', // modern fiction
    'waiwen'   => '世界名著', // world classics
    'ertong'   => '儿童文学', // children's literature
    'gudian'   => '古典名著', // classical masterpieces
    'sanwen'   => '散文随笔', // essays and prose
    'qingchun' => '青春校园', // youth / campus
    'pinglun'  => '文学评论', // literary criticism
    'xuanhuan' => '玄幻仙侠', // fantasy / xianxia
    'yanqing'  => '言情小说', // romance
    'wuxia'    => '武侠小说', // wuxia (martial arts)
    'chuanyue' => '穿越小说', // time-travel fiction
    'xuanyi'   => '侦探悬疑', // detective / suspense
    'kehuan'   => '科幻小说', // science fiction
    'wangyou'  => '网游小说', // online-game fiction
    'renwen'   => '人文社科', // humanities and social sciences
    'zhuanji'  => '人物传记', // biography
    'lishi'    => '历史小说', // historical fiction
    'junshi'   => '军事小说', // military fiction
    'lizhi'    => '励志书籍', // inspirational books
    'shenghuo' => '生活科普', // lifestyle and popular science
];

// Zero-based list of the slugs only, in table order.
$classifyArr = array_keys($classifyAllArr);
// Resume point, read from the last line of result.log. That line is expected to hold
// four space-separated fields: an ignored label, then the category index, the page
// number and the entry index that were reached. -1 means "no resume point".
// NOTE(review): nothing in the visible code writes such a marker line — confirm how
// it is meant to be produced.
$lastCk = $lastI = $lastDk = -1;
if (isset($fp) && is_array($fp) && ($c = count($fp)) > 1) {
    // trim() drops the line ending that file() keeps on each line.
    $markParts = explode(' ', trim($fp[$c - 1]));
    // Only accept a well-formed marker. An ordinary result line (or any other text)
    // must not be destructured: a missing offset would raise an error that the
    // installed error handler turns into an uncaught Exception.
    if (
        count($markParts) >= 4
        && is_numeric($markParts[1])
        && is_numeric($markParts[2])
        && is_numeric($markParts[3])
    ) {
        $lastCk = (int) $markParts[1];
        $lastI = (int) $markParts[2];
        $lastDk = (int) $markParts[3];
    }
}
// Collect: walk every category page by page and append one line per book to result.log.
foreach ($classifyArr as $ck => $classify) {
    // Categories before the resume point are skipped entirely.
    if ($ck < $lastCk) {
        continue;
    }

    $uri = path($baseUri, $classify);

    for ($i = 1; $i < 9999; $i++) {
        echo "Collect: ", $classify, $i, br();

        // Inside the resumed category, skip the pages that precede the resume page.
        // Loose comparisons are kept on purpose: the resume values may be strings
        // parsed from the log.
        if ($ck == $lastCk && $i < $lastI) {
            echo ' continue;', br();
            continue;
        }

        // Page 1 is the category root; later pages are index_<n>.html below it.
        $url = ($i == 1) ? $uri : path($uri, "index_$i.html");

        // A page that cannot be loaded is logged and ends this category.
        // Presumably this is also how the loop stops after the last real page.
        try {
            $indexDoc = new Document($url, true);
        } catch (Exception $e) {
            simpleLog($logFile, $url, $e->getMessage());
            break;
        }

        // Parse the page: one '.bbox' element per book inside '#zuo'.
        $divList = $indexDoc->find('#zuo .bbox');
        echo " continue ";
        foreach ($divList as $dk => $div) {
            // On the resume page, skip the entries up to and including the resume entry.
            if ($ck == $lastCk && $i == $lastI && $dk <= $lastDk) {
                echo " $dk; ";
                continue;
            }

            // Every lookup lives inside the try: a missing element raises an
            // undefined-offset error, which the installed error handler turns into an
            // Exception. Outside the try that Exception would abort the whole run.
            try {
                $dom = $div->find('.bintro')[0];
                $titleLink = $dom->find('h3 a')[0];
                $bookName = $titleLink->text();
                $bookUri = $titleLink->attr('href');
                $bookUrl = path($baseUri, $bookUri);
                // The author link sits in the second-to-last paragraph of '.ex'.
                $bookInfoP = $dom->find('.ex p');
                $authorDom = $bookInfoP[count($bookInfoP) - 2];
                $author = $authorDom->find('a')[0]->text();
            } catch (Exception $e) {
                simpleLog($logFile, $e->getMessage());
                // Skip only the malformed entry; the rest of the page is still collected.
                continue;
            }

            // Store: "<category label>,<title>,<author>,<url>", echoed and appended.
            $newLine = $classifyAllArr[$classify] . ",$bookName,$author,$bookUrl" . br();
            echo $newLine;
            file_put_contents($resFile, $newLine, FILE_APPEND);
        }
        echo br();

        // Throttle requests so the site is not hit too quickly.
        sleep(2);
    }
}