-
Notifications
You must be signed in to change notification settings - Fork 29
/
spider-百科.java
121 lines (90 loc) · 3.3 KB
/
spider-百科.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
package segment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Command-line scraper for Baidu Baike entry pages.
 *
 * <p>Downloads a page over HTTP, prints the raw HTML, the page title, the
 * elements whose {@code class} attribute is {@code "basicInfo-item name"} or
 * {@code "basicInfo-item value"}, and finally whatever
 * {@code TextExtract.parse} returns for the page.
 */
public class Baike {

    /** Upper-case hexadecimal digits used when percent-encoding bytes. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Fetches {@code url} with an HTTP GET and prints the extracted information
     * to standard output.
     *
     * <p>The response body is decoded as UTF-8 from the raw bytes. The HTTP
     * connection is released in every case, including non-success statuses and
     * exceptions.
     *
     * @param url absolute URL of the page to download
     * @throws IOException if the request or reading the response fails
     */
    public static void fetch(String url) throws IOException {
        HttpClient httpClient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode < 200 || statusCode >= 400) {
                // Previously ignored silently; report it so a failed fetch is visible.
                System.err.println("Request to " + url + " failed with HTTP status " + statusCode);
                return;
            }

            // Decode the raw bytes directly. Re-encoding an already decoded string
            // through ISO-8859-1 destroys the text whenever the server declares a
            // charset that is not Latin-1.
            byte[] body = getMethod.getResponseBody();
            String response = body == null ? "" : new String(body, StandardCharsets.UTF_8);

            // Print the returned page.
            System.out.println(response);

            Document doc = Jsoup.parse(response);
            String title = doc.head().select("title").text();
            System.out.println("title:" + title);

            // Extract all "name" cells, then all "value" cells of the basic-info box.
            printElementsByClass(doc, "basicInfo-item name");
            printElementsByClass(doc, "basicInfo-item value");

            // Extract the main text of the page.
            System.out.println(TextExtract.parse(response));
        } finally {
            getMethod.releaseConnection();
        }
    }

    /** Prints the inner HTML of every element whose class attribute equals {@code classValue}. */
    private static void printElementsByClass(Document doc, String classValue) {
        Elements matches = doc.getElementsByAttributeValue("class", classValue);
        for (Element element : matches) {
            System.out.println(element.html());
        }
    }

    /**
     * Reads the whole stream as text and returns it as one string.
     *
     * <p>Line terminators are not preserved: the lines are concatenated
     * directly. The stream is not closed by this method.
     *
     * @param in_st stream to read until end of input
     * @param charset name of the charset used to decode the bytes
     * @return the concatenated lines of the stream
     * @throws IOException if reading fails or the charset name is unsupported
     */
    public static String InputStream2String(InputStream in_st, String charset) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(in_st, charset));
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    }

    /**
     * Percent-encodes the non-ASCII characters of {@code s} for use in a URL.
     *
     * <p>Despite its name, this method emits the UTF-8 bytes of each character,
     * not GBK. ASCII characters are copied unchanged, so reserved characters
     * such as {@code '/'}, {@code '%'} or a space are not escaped. Encoding is
     * done per Unicode code point so that characters outside the Basic
     * Multilingual Plane are encoded correctly.
     *
     * @param s text to encode; must not be {@code null}
     * @return {@code s} with every non-ASCII character replaced by
     *         {@code %XX} escapes of its UTF-8 bytes
     */
    public static String toGbkString(String s) {
        StringBuilder encoded = new StringBuilder(s.length() * 3);
        int index = 0;
        while (index < s.length()) {
            int codePoint = s.codePointAt(index);
            index += Character.charCount(codePoint);

            if (codePoint < 0x80) {
                encoded.append((char) codePoint);
                continue;
            }

            byte[] utf8 = new String(Character.toChars(codePoint)).getBytes(StandardCharsets.UTF_8);
            for (byte b : utf8) {
                int unsigned = b & 0xFF;
                encoded.append('%')
                        .append(HEX_DIGITS[unsigned >>> 4])
                        .append(HEX_DIGITS[unsigned & 0x0F]);
            }
        }
        return encoded.toString();
    }

    /**
     * Entry point: encodes a keyword, builds the Baike item URL and fetches it.
     *
     * @param args optional; {@code args[0]} is the keyword to look up. When
     *             absent, the built-in sample keyword is used.
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        String key = (args != null && args.length > 0) ? args[0] : "大丽花";
        String s = toGbkString(key);
        System.out.println(s);
        String url = "http://baike.baidu.com/item/" + s;
        fetch(url);
    }
}