-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
260 additions
and
8 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package com.gl.jcrawler; | ||
|
||
/**
 * Predicate over URL strings.
 *
 * <p>Implementations decide, one URL at a time, whether that URL should be
 * kept by the caller.
 */
public interface Filter { | ||
/**
 * Tests a single URL.
 *
 * @param url the URL text to test
 * @return {@code true} if the URL is accepted, {@code false} otherwise
 */
boolean accept(String url); | ||
} |
39 changes: 39 additions & 0 deletions
39
jcrawler/src/main/java/com/gl/jcrawler/HtmlContentParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.net.URLConnection; | ||
|
||
/**
 * Downloads the content behind a URL and returns it as text.
 *
 * <p>The body is always decoded with {@link #ECODING}, regardless of any
 * charset the server may announce.
 */
public class HtmlContentParser {
    /**
     * Name of the charset used to decode the downloaded bytes. The spelling of
     * the constant is kept as-is because it is part of the public interface.
     */
    public static final String ECODING = "UTF-8";

    private final String url;

    /**
     * @param url address of the document to fetch; any URL form understood by
     *            {@link URL} is accepted
     */
    public HtmlContentParser(String url) {
        super();
        this.url = url;
    }

    /**
     * Fetches the document and decodes it as {@link #ECODING}.
     *
     * <p>This is a best-effort call: any failure (malformed URL, I/O error, ...)
     * is printed to standard error and reported to the caller as an empty
     * string rather than as an exception.
     *
     * @return the decoded document, or {@code ""} if it could not be fetched
     */
    public String getHtml() {
        InputStream in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            in = connection.getInputStream();

            // Collect the raw bytes first and decode once at the end. Decoding
            // chunk by chunk would break multi-byte characters that straddle a
            // buffer boundary. (Fully qualified to avoid touching the imports.)
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                // Only the first 'length' bytes of buf are valid for this read.
                body.write(buf, 0, length);
            }
            return body.toString(ECODING);
        } catch (Exception e) {
            // Deliberately broad: callers rely on "" instead of an exception.
            e.printStackTrace();
            return "";
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}
76 changes: 76 additions & 0 deletions
76
jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
import org.htmlparser.Node; | ||
import org.htmlparser.NodeFilter; | ||
import org.htmlparser.Parser; | ||
import org.htmlparser.filters.NodeClassFilter; | ||
import org.htmlparser.filters.OrFilter; | ||
import org.htmlparser.tags.FrameTag; | ||
import org.htmlparser.tags.LinkTag; | ||
import org.htmlparser.util.NodeList; | ||
import org.htmlparser.util.ParserException; | ||
|
||
public class HttpLinkParser { | ||
private Set<String> links = new HashSet<String>(); | ||
private String url; | ||
private Filter filter; | ||
|
||
public HttpLinkParser(String url, Filter filter) { | ||
super(); | ||
this.url = url; | ||
this.filter = filter; | ||
} | ||
|
||
public HttpLinkParser parse(){ | ||
NodeFilter frameFilter = new NodeFilter() { | ||
|
||
private static final long serialVersionUID = 1L; | ||
|
||
public boolean accept(Node node) { | ||
if (node.getText().startsWith("frame") || node.getText().startsWith("FRAME")) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
}; | ||
|
||
OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); | ||
|
||
Parser parser; | ||
try { | ||
parser = new Parser(url); | ||
NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter); | ||
for (int i = 0; i < nodeList.size(); i++) { | ||
Node tag = nodeList.elementAt(i); | ||
|
||
if(tag instanceof LinkTag){ | ||
LinkTag linkTag = (LinkTag)tag; | ||
String link = linkTag.getLink(); | ||
if(filter.accept(link)){ | ||
links.add(link); | ||
} | ||
}else if(tag instanceof FrameTag){ | ||
FrameTag frameTag = (FrameTag)tag; | ||
String link = frameTag.getAttribute("src"); | ||
if(filter.accept(link)){ | ||
links.add(link); | ||
} | ||
} | ||
} | ||
} catch (ParserException e) { | ||
e.printStackTrace(); | ||
} | ||
|
||
return this; | ||
} | ||
|
||
public Set<String> results() { | ||
return Collections.unmodifiableSet(links); | ||
} | ||
|
||
} |
This file was deleted.
Oops, something went wrong.
16 changes: 14 additions & 2 deletions
16
jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,22 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.io.IOException; | ||
|
||
import org.htmlparser.util.ParserException; | ||
|
||
public class ImageCrawlerBootStrap { | ||
|
||
public static void main(String[] args) { | ||
// TODO Auto-generated method stub | ||
|
||
String initUrl = "http://jshop.ofmall.org:81/jshop"; | ||
Jcrawler j = new Jcrawler(initUrl); | ||
|
||
try { | ||
j.crawl(); | ||
} catch (ParserException e) { | ||
e.printStackTrace(); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
} | ||
|
||
} |
41 changes: 41 additions & 0 deletions
41
jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.io.File; | ||
import java.io.FileOutputStream; | ||
import java.io.InputStream; | ||
import java.net.URL; | ||
import java.util.List; | ||
|
||
/**
 * Downloads a list of image URLs into the current working directory.
 *
 * <p>Each image is stored under the last path segment of its URL (the text
 * after the final {@code '/'}), so two URLs ending in the same segment
 * overwrite each other.
 */
public class ImageDownloader {
    private List<String> listImgSrc;

    /**
     * @param listImgSrc URLs of the images to fetch; {@code null} is treated as
     *                   an empty list
     */
    public ImageDownloader(List<String> listImgSrc) {
        super();
        this.listImgSrc = listImgSrc;
    }

    /**
     * Downloads every URL in turn.
     *
     * <p>This is best-effort: a failure for one URL is printed to standard
     * error and the remaining URLs are still processed. Null URLs and URLs
     * without a file name (for example ones ending in {@code '/'}) are skipped.
     */
    public void download() {
        if (listImgSrc == null) {
            return;
        }
        for (String url : listImgSrc) {
            if (url == null) {
                continue;
            }
            String imageName = url.substring(url.lastIndexOf("/") + 1);
            if (imageName.length() == 0) {
                // Nothing to name the local file after.
                continue;
            }

            InputStream in = null;
            FileOutputStream fo = null;
            try {
                in = new URL(url).openStream();
                fo = new FileOutputStream(new File(imageName));
                byte[] buf = new byte[1024];
                int length;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    fo.write(buf, 0, length);
                }
            } catch (Exception e) {
                // Keep going: one broken image must not stop the whole batch.
                e.printStackTrace();
            } finally {
                // Close in finally so a failed transfer does not leak handles.
                closeQuietly(fo);
                closeQuietly(in);
            }
        }
    }

    /** Closes a stream, ignoring a null argument and any failure while closing. */
    private static void closeQuietly(java.io.Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (java.io.IOException ignored) {
            // Nothing useful can be done if closing fails.
        }
    }
}
33 changes: 33 additions & 0 deletions
33
jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
/**
 * Extracts absolute image addresses from HTML text.
 *
 * <p>Only the {@code src} attribute of {@code <img>} tags is inspected, and
 * only values starting with {@code http://} or {@code https://} are reported.
 */
public class ImageUrlParser {
    /**
     * Matches an {@code <img>} tag up to and including its {@code src} value.
     * The value lands in group 1 (double-quoted), group 2 (single-quoted) or
     * group 3 (unquoted). Requiring whitespace before {@code src} keeps
     * attributes such as {@code data-src} from matching.
     */
    private static final Pattern IMG_SRC = Pattern.compile(
            "<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Returns the absolute image addresses found in the given HTML.
     *
     * @param html the HTML text to scan; {@code null} is treated as empty
     * @return the {@code src} values in document order, duplicates included;
     *         never {@code null}
     */
    public List<String> getImageSrc(String html) {
        List<String> listImgSrc = new ArrayList<String>();
        if (html == null) {
            return listImgSrc;
        }
        Matcher matcher = IMG_SRC.matcher(html);
        while (matcher.find()) {
            String src = srcValue(matcher).trim();
            if (isAbsoluteHttp(src)) {
                listImgSrc.add(src);
            }
        }
        return listImgSrc;
    }

    /** Returns whichever of the three alternative capture groups matched. */
    private static String srcValue(Matcher matcher) {
        for (int group = 1; group <= 3; group++) {
            String value = matcher.group(group);
            if (value != null) {
                return value;
            }
        }
        return "";
    }

    /** Tells whether the value starts with http:// or https://, in any letter case. */
    private static boolean isAbsoluteHttp(String src) {
        return src.regionMatches(true, 0, "http://", 0, 7)
                || src.regionMatches(true, 0, "https://", 0, 8);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package com.gl.jcrawler; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.htmlparser.util.ParserException; | ||
|
||
public class Jcrawler { | ||
private String initUrl; | ||
private LinkQueue linkQueue = new LinkQueue(); | ||
|
||
public Jcrawler(String initUrl) { | ||
super(); | ||
this.initUrl = initUrl; | ||
}; | ||
|
||
public void crawl() throws IOException, ParserException { | ||
Filter filter = new Filter() { | ||
public boolean accept(String url) { | ||
if (url.indexOf("http://jshop.ofmall.org:81") != -1 | ||
|| url.indexOf("http://jshop.ofmall.org:81/jshop") != -1) { | ||
return true; | ||
} else { | ||
return false; | ||
} | ||
} | ||
}; | ||
|
||
linkQueue.addUnvisitedUrls(initUrl); | ||
|
||
while (!linkQueue.isUnvisitedUrlsEmpty()) { | ||
// 队头URL出队列 | ||
String visitUrl = (String) linkQueue.popUnvisitedUrls(); | ||
if (visitUrl == null){ | ||
continue; | ||
} | ||
|
||
String html = new HtmlContentParser(visitUrl).getHtml(); | ||
List<String> imageUrls = new ImageUrlParser().getImageSrc(html); | ||
new ImageDownloader(imageUrls).download(); | ||
|
||
|
||
Set<String> links = new HttpLinkParser(visitUrl, filter).parse().results(); | ||
for (String link : links) { | ||
linkQueue.addUnvisitedUrls(link); | ||
} | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters