-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathcrawl.php
executable file
·82 lines (62 loc) · 2.57 KB
/
crawl.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
<?php
// Define the URL
$url = "https://learn.lianglianglee.com";
# 1 Fetch the site's main directory listing
// $response = file_get_contents($url);
// if ($response === FALSE) {
// echo "Failed to access the URL.";
// } else {
// // Use regex to find the href values across multiple lines (with the 's' modifier)
// preg_match_all('/<li><a href="([^"]*)">([^<]*)<\/a><\/li>/', $response, $matches);
// // Prepare the output for the readme file
// $output = "";
// if (!empty($matches[1])) {
// foreach ($matches[1] as $href) {
// $output .= $href.PHP_EOL;
// }
// } else {
// $output = "No match found.\n";
// }
// // Write the results to readme.txt file
// file_put_contents("README.md", $output);
// echo "Results saved to readme.txt";
// }
# 2 Create one folder per directory entry and download the pages listed in it
// Path to the README.md file. Each non-empty line is appended to $url as a
// path and also used (with spaces removed) as a local folder name.
$readmePath = 'README.md';
// Local root under which one folder per README line is created.
$outputRoot = "/Users/01397713/Documents/github/learn-tech";

// Read the file into an array of lines. file() returns false when the file
// cannot be read, which would otherwise make the foreach below emit a warning.
$lines = file($readmePath, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($lines === false) {
    error_log("Unable to read {$readmePath}; nothing to crawl.");
    $lines = [];
}

foreach ($lines as $line) {
    // Folder name: the README line with spaces removed, under the output root.
    $folderName = $outputRoot . str_replace(' ', '', $line);
    // URL path: the same line with spaces percent-encoded.
    $line = str_replace(' ', '%20', $line);
    $curlUrl = $url . $line;

    // mkdir() raises a warning when the directory already exists (e.g. on a
    // re-run), so only create it when it is missing.
    if (!is_dir($folderName) && !mkdir($folderName, 0777, true) && !is_dir($folderName)) {
        error_log("Unable to create folder {$folderName}; skipping {$curlUrl}.");
        continue;
    }

    $response = file_get_contents($curlUrl);
    if ($response === false) {
        error_log("Unable to fetch {$curlUrl}; skipping.");
        continue;
    }

    // Capture group 1 is the menu item's id, group 2 its href, group 3 its text.
    // Only the ids are used below, both as file names and as URL segments.
    preg_match_all('/<a class="menu-item" id="([^"]*)" href="([^"]*)">([^<]*)<\/a>/', $response, $matches);
    if (empty($matches[1])) {
        continue;
    }

    foreach ($matches[1] as $name) {
        $fileName = $folderName . '/' . str_replace(' ', '', $name);
        echo $fileName;
        echo PHP_EOL;

        // Skip pages that already have non-empty content on disk. is_file()
        // guards filesize(), which raises a warning for a missing file.
        if (is_file($fileName) && filesize($fileName) > 0) {
            continue;
        }

        $uri = str_replace(' ', '%20', html_entity_decode($name));
        $fileUrl = $url . $line . '/' . $uri;
        $fileContents = file_get_contents($fileUrl);

        // Keep only the markup between the "book-post" div and the "prePage" div.
        $found = $fileContents !== false
            && preg_match('/<div class="book-post">(.*?)<div id="prePage" style="float: left">/s', $fileContents, $divMatch) === 1;

        if ($found) {
            // The meta tag tells DOMDocument to parse the fragment as UTF-8.
            $html = '<meta charset="UTF-8">' . $divMatch[1];
            $doc = new DOMDocument();
            libxml_use_internal_errors(true); // To handle any invalid HTML
            $doc->loadHTML($html);
            libxml_clear_errors();

            // Store the plain text content of the extracted fragment.
            if (file_put_contents($fileName, $doc->textContent) === false) {
                error_log("Unable to write {$fileName}.");
            }
        } else {
            // No file is written, so the page is attempted again on the next run.
            error_log("Unable to fetch or extract content from {$fileUrl}; skipping.");
        }

        // Pause after every page request before issuing the next one.
        sleep(5);
    }
}