Skip to content

Commit

Permalink
jshop
Browse files Browse the repository at this point in the history
  • Loading branch information
gavin2lee committed Aug 23, 2016
1 parent 3df0b43 commit 9111d3d
Show file tree
Hide file tree
Showing 10 changed files with 260 additions and 8 deletions.
Binary file added jcrawler/log.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/Filter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package com.gl.jcrawler;

/**
 * Predicate over URL strings: implementations decide which URLs are kept.
 */
public interface Filter {

    /**
     * Tests a single URL.
     *
     * @param url the URL to test
     * @return {@code true} if the URL is accepted, {@code false} otherwise
     */
    boolean accept(String url);

}
39 changes: 39 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/HtmlContentParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.gl.jcrawler;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads the resource behind a URL and returns its content as text.
 */
public class HtmlContentParser {
    /** Name of the charset used to decode the downloaded bytes. */
    public static final String ECODING = "UTF-8";

    private final String url;

    /**
     * @param url the address to download from; it is parsed when
     *            {@link #getHtml()} is called
     */
    public HtmlContentParser(String url) {
        super();
        this.url = url;
    }

    /**
     * Fetches the whole response body and decodes it with {@link #ECODING}.
     *
     * <p>All bytes are collected first and decoded in one step, so only the
     * bytes actually read are used and a multi-byte character that spans two
     * read chunks is decoded correctly.
     *
     * @return the decoded content, or an empty string if the URL is invalid or
     *         the download fails (the stack trace is printed in that case)
     */
    public String getHtml() {
        try {
            URLConnection connection = new URL(url).openConnection();
            InputStream in = connection.getInputStream();
            try {
                ByteArrayOutputStream content = new ByteArrayOutputStream();
                byte[] buf = new byte[1024];
                int length;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    // Copy only the bytes this read produced, not the whole buffer.
                    content.write(buf, 0, length);
                }
                return content.toString(ECODING);
            } finally {
                // Release the connection's stream even when reading fails.
                in.close();
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be fetched yields no content.
            e.printStackTrace();
            return "";
        }
    }
}
76 changes: 76 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package com.gl.jcrawler;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Collects the link and frame targets found on a page, keeping only those
 * accepted by a {@link Filter}.
 */
public class HttpLinkParser {
    private Set<String> links = new HashSet<String>();
    private String url;
    private Filter filter;

    /**
     * @param url    address of the page to parse
     * @param filter decides which discovered links are kept
     */
    public HttpLinkParser(String url, Filter filter) {
        super();
        this.url = url;
        this.filter = filter;
    }

    /**
     * Parses the page and stores every accepted link target.
     *
     * <p>A {@link ParserException} is printed and otherwise ignored, so the
     * results gathered up to that point stay available.
     *
     * @return this parser, so that {@link #results()} can be chained
     */
    public HttpLinkParser parse() {
        // Matches nodes whose text starts with "frame" or "FRAME".
        NodeFilter frameFilter = new NodeFilter() {

            private static final long serialVersionUID = 1L;

            public boolean accept(Node node) {
                String text = node.getText();
                return text.startsWith("frame") || text.startsWith("FRAME");
            }

        };

        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);

        try {
            Parser parser = new Parser(url);
            NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                Node tag = nodeList.elementAt(i);

                if (tag instanceof LinkTag) {
                    addIfAccepted(((LinkTag) tag).getLink());
                } else if (tag instanceof FrameTag) {
                    // The attribute lookup may yield null when the frame has no src.
                    addIfAccepted(((FrameTag) tag).getAttribute("src"));
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }

        return this;
    }

    /** Stores the link if it is present and the filter accepts it. */
    private void addIfAccepted(String link) {
        if (link != null && filter.accept(link)) {
            links.add(link);
        }
    }

    /**
     * @return a read-only view of the links collected so far
     */
    public Set<String> results() {
        return Collections.unmodifiableSet(links);
    }

}
5 changes: 0 additions & 5 deletions jcrawler/src/main/java/com/gl/jcrawler/HttpUrlParser.java

This file was deleted.

16 changes: 14 additions & 2 deletions jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
package com.gl.jcrawler;

import java.io.IOException;

import org.htmlparser.util.ParserException;

/**
 * Entry point that starts a {@link Jcrawler} run from a fixed start URL.
 */
public class ImageCrawlerBootStrap {

    /**
     * Runs one crawl; a failure reported by the crawler is printed as a stack
     * trace.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        Jcrawler crawler = new Jcrawler("http://jshop.ofmall.org:81/jshop");

        try {
            crawler.crawl();
        } catch (ParserException parseFailure) {
            parseFailure.printStackTrace();
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
        }
    }

}
41 changes: 41 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package com.gl.jcrawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.List;

/**
 * Downloads a list of image URLs into the current working directory.
 */
public class ImageDownloader {
    private List<String> listImgSrc;

    /**
     * @param listImgSrc URLs of the images to download; {@code null} is treated
     *                   as an empty list
     */
    public ImageDownloader(List<String> listImgSrc) {
        super();
        this.listImgSrc = listImgSrc;
    }

    /**
     * Downloads every URL in turn. Each file is named after the part of the URL
     * that follows the last {@code '/'}.
     *
     * <p>A failed download is reported with a stack trace and does not stop the
     * remaining downloads. Entries that are {@code null} or yield an empty file
     * name are skipped before any connection is opened.
     */
    public void download() {
        if (listImgSrc == null) {
            return;
        }
        for (String url : listImgSrc) {
            String imageName = (url == null) ? "" : url.substring(url.lastIndexOf("/") + 1);
            if (imageName.length() == 0) {
                System.err.println("Skipping image URL without a file name: " + url);
                continue;
            }
            try {
                InputStream in = new URL(url).openStream();
                try {
                    FileOutputStream fo = new FileOutputStream(new File(imageName));
                    try {
                        byte[] buf = new byte[1024];
                        int length;
                        while ((length = in.read(buf, 0, buf.length)) != -1) {
                            fo.write(buf, 0, length);
                        }
                    } finally {
                        // Close the file even if the copy fails midway.
                        fo.close();
                    }
                } finally {
                    // Close the network stream even if the file cannot be opened.
                    in.close();
                }
            } catch (Exception e) {
                // Best effort: report the failure and go on with the next image.
                e.printStackTrace();
            }
        }
    }
}
33 changes: 33 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package com.gl.jcrawler;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts absolute image URLs from HTML text using regular expressions.
 */
public class ImageUrlParser {
    /**
     * One complete {@code <img ...>} tag that contains {@code src=}. The
     * {@code [^>]} classes keep a match inside a single tag.
     */
    private static final Pattern IMG_TAG = Pattern.compile("<img\\b[^>]*?src=[^>]*>");

    /**
     * An http or https URL, ending before the first quote, {@code '>'} or
     * whitespace character.
     */
    private static final Pattern IMG_SRC = Pattern.compile("https?:[^\"'>\\s]*");

    /**
     * Returns the http/https URLs that appear inside the {@code <img>} tags of
     * the given HTML, in document order.
     *
     * @param html the HTML text to scan; {@code null} is treated as empty
     * @return the URLs found, without surrounding quotes or whitespace; empty
     *         if there are none
     */
    public List<String> getImageSrc(String html) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : getImageUrl(html)) {
            Matcher matcher = IMG_SRC.matcher(image);
            while (matcher.find()) {
                listImgSrc.add(matcher.group());
            }
        }
        return listImgSrc;
    }

    /** Returns the text of every {@code <img>} tag that has a {@code src=} part. */
    private List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<String>();
        if (html == null) {
            return listImgUrl;
        }
        Matcher matcher = IMG_TAG.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }
}
51 changes: 51 additions & 0 deletions jcrawler/src/main/java/com/gl/jcrawler/Jcrawler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package com.gl.jcrawler;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.htmlparser.util.ParserException;

/**
 * Crawls pages starting from an initial URL and downloads the images found on
 * each page.
 */
public class Jcrawler {
    /** Links must contain this text to be followed when no other value is given. */
    private static final String DEFAULT_ALLOWED_URL_PART = "http://jshop.ofmall.org:81";

    private String initUrl;
    private String allowedUrlPart;
    private LinkQueue linkQueue = new LinkQueue();

    /**
     * Creates a crawler that only follows links containing
     * {@code http://jshop.ofmall.org:81}.
     *
     * @param initUrl the first URL to visit
     */
    public Jcrawler(String initUrl) {
        this(initUrl, DEFAULT_ALLOWED_URL_PART);
    }

    /**
     * Creates a crawler with a custom link restriction.
     *
     * @param initUrl        the first URL to visit
     * @param allowedUrlPart text a link must contain to be followed
     * @throws IllegalArgumentException if {@code allowedUrlPart} is {@code null}
     */
    public Jcrawler(String initUrl, String allowedUrlPart) {
        super();
        if (allowedUrlPart == null) {
            throw new IllegalArgumentException("allowedUrlPart must not be null");
        }
        this.initUrl = initUrl;
        this.allowedUrlPart = allowedUrlPart;
    }

    /**
     * Takes URLs from the queue until it is empty. For each one it downloads
     * the page, downloads the images referenced on it, and queues the accepted
     * links found on the page.
     *
     * <p>NOTE(review): whether a URL can be queued more than once depends on
     * {@code LinkQueue}, which is not shown here.
     *
     * @throws IOException     declared for callers; not raised directly by this
     *                         method's own statements
     * @throws ParserException declared for callers; not raised directly by this
     *                         method's own statements
     */
    public void crawl() throws IOException, ParserException {
        Filter filter = new Filter() {
            public boolean accept(String url) {
                // Links without a target (null) are rejected instead of failing.
                return url != null && url.indexOf(allowedUrlPart) != -1;
            }
        };

        linkQueue.addUnvisitedUrls(initUrl);

        while (!linkQueue.isUnvisitedUrlsEmpty()) {
            // Take the URL at the head of the queue.
            String visitUrl = linkQueue.popUnvisitedUrls();
            if (visitUrl == null) {
                continue;
            }

            String html = new HtmlContentParser(visitUrl).getHtml();
            List<String> imageUrls = new ImageUrlParser().getImageSrc(html);
            new ImageDownloader(imageUrls).download();

            Set<String> links = new HttpLinkParser(visitUrl, filter).parse().results();
            for (String link : links) {
                linkQueue.addUnvisitedUrls(link);
            }
        }
    }

}
2 changes: 1 addition & 1 deletion jcrawler/src/main/java/com/gl/jcrawler/LinkQueue.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public String popUnvisitedUrls(){
return null;
}

public void addVisitedUrls(String url){
public void addUnvisitedUrls(String url){
if(url == null){
return;
}
Expand Down

0 comments on commit 9111d3d

Please sign in to comment.