From 9111d3d2ea6bf57399b2c5b01f1ce7cbbf252077 Mon Sep 17 00:00:00 2001 From: gavin Date: Tue, 23 Aug 2016 15:08:41 +0800 Subject: [PATCH] jshop --- jcrawler/log.png | Bin 0 -> 8692 bytes .../src/main/java/com/gl/jcrawler/Filter.java | 5 ++ .../com/gl/jcrawler/HtmlContentParser.java | 39 +++++++++ .../java/com/gl/jcrawler/HttpLinkParser.java | 76 ++++++++++++++++++ .../java/com/gl/jcrawler/HttpUrlParser.java | 5 -- .../gl/jcrawler/ImageCrawlerBootStrap.java | 16 +++- .../java/com/gl/jcrawler/ImageDownloader.java | 41 ++++++++++ .../java/com/gl/jcrawler/ImageUrlParser.java | 33 ++++++++ .../main/java/com/gl/jcrawler/Jcrawler.java | 51 ++++++++++++ .../main/java/com/gl/jcrawler/LinkQueue.java | 2 +- 10 files changed, 260 insertions(+), 8 deletions(-) create mode 100644 jcrawler/log.png create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/Filter.java create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/HtmlContentParser.java create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java delete mode 100644 jcrawler/src/main/java/com/gl/jcrawler/HttpUrlParser.java create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java create mode 100644 jcrawler/src/main/java/com/gl/jcrawler/Jcrawler.java diff --git a/jcrawler/log.png b/jcrawler/log.png new file mode 100644 index 0000000000000000000000000000000000000000..f385f1553a3c2b55dfa147109ca74742c05fc977 GIT binary patch literal 8692 zcmV!-9fcuppvhuZX?V?bv(Q*if){5kbKM7K#-V2_+$fBm_uEA%Or1sSrX4 zqz6cVG3eJnK38?7hmYnZJC0nSTx{w}SS*_q}i1-`@WA zx0kE0zPj9S!wu!#?|yfA|NGxxHf_^xsdmtA?;wJ&407FdrF{Fhm-6*rKOH~wGo>7Q zXg%kOE3T*qCVB6B-@BRBO*?QHq-&33k1gePw=3miKBkm!`leEDbDQZn?X;>zUh#@o zlsCNL4dos0c*jPId;mfy$EMxN7=%W^0JU9yx!Ek`=YM`;j-UIv`t_?{{p#}4m%g-I zdg-O*t#5tnW}^{#hqwD7HoK}_IV-tv~^7eD%=6Lb9GANH@F z{p@F#7rp33<)Vu&s^@MPWbYVbwGl3FeB&GI-*vxl7-SbtCh69!@lw9~yC>#&^rL6z zopQ=4`-MRe7U}Bnd2gQ+qI6v(Hc9=?cfPZJ-q#X4&-X0P?Ct8SOZm;;Eaf}Dvy`Vk zwSIN+#TS>?zy9@=N%rI%*)Yg%G0wF&y{VLY-m{bs`>;Nd<-`+9`Kg~O3tUyMyz6eyr_q&(!=YL+xXMDyqnIH5)rTp5jm2%Zp`y66*3Cuz8 ziJw@?cYH_v{=y3{tZ?MI>#o}(X**#qn5cZzM@<^=b3Uh(+un9bkeIys9WF1gdey5c z_P?bGvXA2vpEy19TfSv0V>tfxU+Za?U3OV{&U2o#VUXR-Af33*JhPO~{_L%j`GsEy zD9`MVqmMqioPPT0b%OU6V(qPKv)e!z#Ph{q77bCKb=Fzsb+3C}4^q-JqrFiu4yHKr z$Wngq_onN=;0sFmLh1Y4-uAXzl(_@%YWw3qF6HAsZtHmdk;C~ z3+P58x&FrbepBKV-hqajwcq-!IfMMjkHC5Lr`NpZHRXg8PFN`J^B25(&BookTN$Lg zbr)*BeDWtx6S+|R|RgLnoxhQ{}Q)dH%8xVJS-*Dy$hPXF;A z!zq_nM$!Ksy8#dX>azg_}vz4o=Qt+`?S9lqfkO8M>Io<5&;+622Wi03z^q5-AKV5(>@ zOyC~BclRAN4J!8ANm?2_8P=!+(1y5Ie#YkuJuCI;bRUDp;2 zzEFd!Qe$GYgAeXdXm4`Q`ftyF{^ZzT(|y^OZG8_6-x?QhG^#PA4ZYWmxwJP7vY%Rq zIN}{9?-|cn{`gs+wVc#n`lTr@;`1;5Vt2iQ2&6dT$%OWaeaTDK6FRN1Ah`Z)QF^e0 z<^rwO8Midc@y82{Oz!7JV{DVa2)M&UaOT&4eF+u+?9V0!;kHhD-t#8L`P|Q4-XApk zm0v04zy51!v+(4VQsLNn=bcvxc{@*2m->tIs+QZ1$?tsU?hV}1n*9a(IWWwD2D^3@ zgUtN)w8g#dwfu1bpSDacB75v(`zgFxyjlo!Gc-mBR!+VKmG zAg>GY^#*?I31&U&Q5~AN_KXG80EGII;v{OxU@$*X~Q5ppglT+ z{Pa&xYd^;J-~T;L9^zi(#Y)2*T&BD*4E0%J9_An!#~8qKpWCl}+S8s^V*#S_4eW0v z9>%R~VjeYr5#B<3Xk5;0pv&XV6|Y$bX)p-JSc__!Tbb)s%*6fmk;m-s|9-m1GoRVD zh+yS9EyBp}?c2U>-UqJdRU=*C5~QRZXU@CJtKh--KHt)@nhk^81n*VDD$|6u#ZbLH z?s3!i0&c5qp*8B!CU;yw)d_BuImV+wm*+2xS+SK18o`%`KYZ)CEk$JJ`u+XiZ%(Sy zx|5)5fYs|c9WaP=kH!!$c)<%QW1iS-nW}l>HGA@tm(P>x;yVIl(WLmtJscqrfzZcO zA0PbSiMiNTV_!b>Lnmm(F)%b9GWOK5#~xc@D}f<4!ilkgSUasfxI{sMG-(nfs7b^e znViMMRofxPZxZfKIHCS58q6Kqy-AGK5vq&_d|&zO)M{jRZ`QihjBowcQhxSlYYtdv zkRSZPdO8nQByDZS9k-OuqUoeIM#doS(}?`(Pk(wP#@?i}xbC0-c?|SS(9-ve`UVLT z&CgnnnI<7*KL7LQ{r{S;Ddm6vcd{=qNOOmSh3vN@7yWdAtQZ%gfLNZv7TFl-J863e z3|cif7~bW}zkI^ZX@0{%CYw7}@JlXHpu@{w-ie(zi*)Tw0=7KlA(PiD1jDn{o-fS; z!PdsJvnSi*2Yz5n@Q_WwNWUx+nFor840Bis`S{0A-(wI?i01iv?f~=3kN^0V^)A)c zVm$jEw5#2_s%krg-D4$b81pB7V&30x{^qja#N|1>YrKF9Z6ojMkV6isP6k0sW8hub zRo-P^XB=-BWN%%&@`bop;{g%{PP~HQY?A$tSh!dx!(sYP;H}vXv;gTx^3eucT%nj~ zWRQwpPOdQL<&D8=5HjZ=?o|e%xmFkiyaGo^+uEe#3&1gYfAv?F&&MF&^F8$=d#Se= z^N2f*y~Q5${UA6H@0R!pK_T;bHI)Q|uC`D4l*s{5N?nQh%!OTJ%)Gz{ms%qYECCaI-(0RF>fwhU-qYV_=gR9E*FX#m5~8DZ zYRj@-8YfcS@l{_n!4#ht3^+0foQ}wPqtVzsf_x)xr&)gZhvx$kcdYxzZeqi*S!h+T zsI{YxI%;Bi!BH^ti@$jOPG9mRQx?KtTd*C#xT_NLAW)2VV^UfSL}a^xQe*Pvf!ubK zL1wl`2xMV9Y+;b3m>`BU2Dt=A-;?&;jy<^=mJH?&JuqE}=dO31+?=3W7{u|P|5;D% z*MLTTU$tBEzk8;C&7NQ|(^?%0#_M+W*~@+}piBAdzn-G-X@B>3)43o1;V{=yGa_Q% zuJ#=w$NZ$dxI}GY7oZZz_P2jKrN0b~!+{sdHQdlO1xKtWIapIoPBMhpb8kN08|V9fl_)1<}Q#9P(urvxGqIb)h}-U)e#2gN?2L1`KX z?;M-#^e|ugEe2Bk7w1)bh6V+zHcM@H0fWq<=7`gTk*=jwFG__FnaGd$h_axF<;BUT z!Xz+0FB0N2;VmbtOY%{BfGpZrHpszN7#nwM;LfUHJ~o8!`ycwD<#)VgaUx{>%6Jbp zf_Q@%i8Pvg4?@PyA}qMI2d#M6VTV&2)qczP38tzTX{f0sIx9uT5G2~O3=tB=ZbQ#T-lO?UAX_g+>;J;^6jQLAG4@rx) zS$plb7I)}yEyqCI-G0b57%{=Qfq{FAAwL@6F|>0aLIOa<-tyrz?r4xI=iTETrF_VT zOs?etCeav29C1XAky!J>fR?q4kfp|86z|G=CJ&8+!r0NE2tHmY(CL~4SQB(@o;2Dh zDLUJwu51`&C!m$az?E~v#al{+5e!BXCXTT*M=tLWqvuPp7kIwd6^OJ!4Uhq>XdE{Y z14DVtZ5q-xrm)J<)%Rap((Ge9jMs@uKkQ+X*;1wxI?IRLUiFXCTlcI2Zv4y5J0!I**Aljvo*vgcy8rlrnfjKZASkhPE zMVoL2McCThIyT@fKQEC5^=v^c7tR;A-4>E-biKMr*{xHV%Cybv=uK29vgvqouacP#8pC z_q4ub!yx;qrJ~^2Vp||?gp^V`tMy}(gqOk&&WnaYgfs@2kkkl=NgtQe(nJUXL4y_u znsMqJ@I+1%?C!o{K1}O8=VM~m^C;!On2cnaj})fO+|OT!{vdY7^@MV+sTzj1^X~5H z9vCu_Z0G9B&B=W-UY^x+%T}6ac|QBY7#PpZI`kc85QJ{h5HhKSP!Jxe$*$PjML8); z8V!RW5fvjdNqZ*S@1PiBM+gX!q)#R+ZQ%MWDb0N`i|<{FcrYMukAqe~Tsbf%t&_Fr zp1)dnA;lrw%d`Def zvm+b820Y94v?Q&kKlj!~{Kq<4&UdUU$+KF_I@jbqa=m_F zIDNoW-eq6sJw=0gcg#R5I%qU-^5#Oh4TJ0>CfQ>hsy(z1q)dMZJ;%Nx##K)erpOj? z(J+C3H>E=SEzvJy6&rA@EWaCI)&82FTMdKE20&0x#9GRSBtC@ zTD}sPv>S1A*GLwAwR%_}6^VTyE;!m7S{SR(7$d?2YS;~9b8G$G8;}*=@N)U86~iZ1tuX$ZVfYz1#IRX>GC#S|)cqk-K4e z@6H7ZRVB4e|?289wQ`Xp-JB1Ps+TT*QOITU1E#HYh zUdbIP7UC4iN>W`02}5(aETeLGuBN;)})Vdju@=#2HpMQa*a_PgL=D` zm`RVio9ICzN}s{_phGVx*P8zm*bHNCw}&8t@{G~*w2I-~htP<3ECJWC#hbTXrNrFR zU(ZaA*Vw<+>+=3NFCM;i0&+_d<)8lPywc>*#J1dbr#x-@ok9A*C`Ho>CYuD&WI;oI zdcOIK+Rjn6S>kHdU6~T=1i1w0{63p}8p%q$fBeVE@NX?C-zINMDut?ok`N_Kw(MP# zkpSZC{#G6;#Ku|R8X3Dol=|A#)XgF-YePz@X(Zwfoyp{6SuEAEhxJY5KDE3CYMKf# zcz5s9u8+HRR@O-0TmMDkxo~F$>OK~4wZux#%g$D-gxCLfzlESkaqk3Fd-HyrCM_S$ zoO$l-9b{Z9O2M`q*T&0kNS`@l2*CV59;Ngk>`pjj_B(@g37Q&Dt@f@#O%755XSGR# zcV4SDl0?2SQL#$utk(0_l3HuI+X5%|!gRAlJejnza+LjI z3Mhz1T1#IdY#n-A6?yTn!BAxkW1{4ceshSlr3HzgkK4%2YWnJ*>WXD5qtsVS8XKjL zvYy8H)v0>J^0~?z@jzM*Y@H5BZB35i+1}4u8hvrNPd<-FTd7GHa>(pND$I^BNS{S3n|om(3}G?Z z*a!(L6q}G?xAnC#s;sQ;LqZ5-VN~Gi{qH~RkyxhX1*0M}k0g$nX2U z2_j3G$HZf^bQ(i)&1d;^n(Sp`oaP>b{jm;KCLLxhkoW)ANKfFbw`__-N;gPFjrYwyEgt&N(b}H0n!<0XyJbbB_a0XnW`$y_)R^(=0wDz681k$vLzb~T*gUm) zzVDjnuQz>Ym1ATflg^|2xificjg3NwHu~6)omZZ*T!Cgyv-SoP55+fTyIg^&W5Eg# zjdpI0CQb4wyG?d82HB~dcXuQg3t{#3(UTQBsZ)}ZLB1fUZhw5&clBZph_k~G5ND_# z#devM+{q3~AtI6toFctjOVVO*fAmMwb6B_(4(EWEJLKRSqOr$_;+j{lD!q& z_C69A*&-?M-E9oQg{EB&)>KEU6|U7TcLrNh;3M6 zTkQvH(i^;EYgr?ciX>e-skKE)m$EGVqtZQ`AHiU3nyo^59~tA=nh@jCRu>{d&I8&y zd1-wdY3KDh8(dCeXK%((r5D>kEHU>a6kE-KRM)35papLH zI2>-yx|mxXgTxjAwWv;mF_zEl#NfN^VX+Dm9{I?vH|ABs1c-XDYpOS7AdFd4J;da* zNm{XlUUmpHVuYrs0_fuz1MHykld4{uq7CdKjrJi?+i#sIqusLRJqB^pl$g!x#8iv# zMtAI%2C@3WEt*uJJN;+aH}4n%ead3&UXtQ?v~#l;FoMNmG)T65?$VhvLqQ44nSCei zNBylq^L9-Y_bkF6tGg_5>2k*|-oLTM{%CNFP=EsBJjgCriQDW%RDH*)nvhnB0b~VV z7AAGL5`q{DJEvlL2of_f6iI2uy!unC#ySIj+{%sQV_(3&AXSrKiL+4ir0dMRi#bTu zv3yV4Up(=X+IX}`T2fop#}@3QWqMsI#LRw5OOd(e?Xt2EyfP--u^q$5ZfW=F4ggSy z;Y~QSTr~vt)P+rQ)TsS<&RU@go;A;i4Kj;cgm4-5H0!XDksxh{$6&5^(P)REfM0ch zWQStRGi-I&85T5CM7zC9w)6JRiP0QzsV1vTLUNH5a4^VV!p@Fl%|<$7wLox!gihHX zP{np=h;`W8FEycRV<0Ob%3wuAaJYRA+ddnSs4207IkK2(%W6E35b?*&$z}*FNy-G6 zrbbsQiV&vjOoW(%+cgqam6HaE^dTDo@gDW~2`$HoFdDv`8J@|0YE|Z^P|Pzc!S&)6 zDfQDB7VILeQ!FmK=+;I%Tf?uY9ih|WIhf5oyZ0`|@7Qs?b3Ae?rEpSkJA22$JLOs4 zt!q*m$2^p2ual8scT3LNj@RZLcXc&U2(n{Wwg(0n;}@6$ZspSl0mkCb&2`DBb@T$y zU!o81Jh! zwrH}J*qCbXgaq55$8K_6({^17v)e#KOxI#V-o^WOK||B{y?Nxk+n9}zbdJs-1PWUa z(KnZHU|$zH0)OTP3^q^trpPuY$jT->o+l8DIXE{uWTerW(*$hG-YmOm>F=)FVJi7L zeWF^hs7p*4zvJG8Eigv<#Hc0tvu{Ri;E>?m(EOe?;jlgl<~#V8isw=!$-oKU`5p5x zB+raS%f5+m3v)?}IUv~P0#?3JX(8<%j||_47+z5Xp>3uw|GLAm{dH17=mbg=cN4& zD>Z6ok#Nvt@W+!>uGt$^4i94(40pvCM5H-d;p*BzJ$-AH*ugx+XyW^IVg_*i7<^-# zgMjPeGPD9HqHIs|S|KK`}3fmw~O^KWbk^8w8u-B4sND9Y9hLh8L6 zhcG+!yt++WjCj|ITBn&LMlp#u#y!$F1>}XrKhN96u!P&kw2iU(jTB8l45N{V%?=<1 z8MI;~U|N@&qW&pTy^JzW_QmL828%EnTR^O!K^qXqSX9Q&Gsn1sI0lV|>9SpQV=P*X zmhD;6;Ei)iS@w@Z!*~8R=K^R%v-LV-ag>a%@@!YP`fhO(gNDt^)zJWN^Va+? z&YRRI*hwbYox9wEE(obbY%{UPXVaXx7!f_=7UW_Sa_^F;jKd}|yFuul;F&ZDaKm|+ zj6unsAq_KeJU`CG0JNC<(m>#oEM)k6*d4cPHVoZ}2((C@uUV8`B}FMAk~JG7YA-o$ zcEhV??Th#4z8yS6#(^~N_Dt#;dv}@wR1?eLARr6|?;pl;EjZb2Gg0}riB7K{238GP z5wtSyBHLXHCJlvWPdZ7O7?6M{!=A^_{9tgwM045T)EOxv&Iw~ootMA*v(WCeYqp-= z$y!~zfgEja=gi!dR!qMo4om`(!0;!9=W<4()LP zP-o?QL=SpY&4vM+hC_@gp$a;rP(;@>ttzAE>EBtaQgqDUYR|!J;M#1HQ5t}&m4pVl z+-Id8F!!lzvm`qVfK%z_`)&JFj`h z`k0tQTC7F#Q(h^Kik-4YcFzVniD(~u@RaY$fD=W#xG z5BJwczv+{{WoffMBUH>DCx*jsagcat4Cvhgk^FAlGbiq8y!=nt(PsKiO=a zF9$4x%oZU@jSysWEGBA%RwmFu8bH#NE7(A$JSZkePL#hUAYKzhZo*Qg6yYP{T;m{> zvzifMWw%;b-K6JS6YLrO_dTXH$q3tIrZAho`imKYp|jWtBk7mx^~1T@nF{fHzTdoq z!)Nz&f7d2&sy1u%xp~X9=*(C-NPdi!T&A(^7)DL?NDSgP@2XG6(_impEc7ifCkC_k z@m~7t-dX*blpEjq?7NKb>YTXNyM%eX^LTzX2-4>*Yh(_jNP2UTOlcf&t8|7YDKJOU zEf5aLW@6Ws{U-%e7$M#h-zROJbaKQKW#AF8bCW%4va)r6`#O)tNRhYP-{JpkNTXi} z2-CU-(YZ$!XhlPM-%$c*^P6b1xRvu8!t@?_j{6f1Jtc?D!VYMGOi^>1HrmgaWq**d z>`+z&`fLnSCo6S~^A5%@i@3(Z*~VWV+|zjJr^ETK)7NPEROervlk9Jjf_$!Ve$MrI z-LnrU2InMJ!*duiz^7&bL_Q-nK2 zxLskoaSX>|eR+pynsx7ycvZ&Hd*uAUN#7Z#jPW=p=}*?>yT 0) { + // TODO + sb.append(new String(buf, ECODING)); + } + in.close(); + return sb.toString(); + } catch (Exception e) { + e.printStackTrace(); + return ""; + } + } +} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java b/jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java new file mode 100644 index 0000000..e7e7dd6 --- /dev/null +++ b/jcrawler/src/main/java/com/gl/jcrawler/HttpLinkParser.java @@ -0,0 +1,76 @@ +package com.gl.jcrawler; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.htmlparser.Node; +import org.htmlparser.NodeFilter; +import org.htmlparser.Parser; +import org.htmlparser.filters.NodeClassFilter; +import org.htmlparser.filters.OrFilter; +import org.htmlparser.tags.FrameTag; +import org.htmlparser.tags.LinkTag; +import org.htmlparser.util.NodeList; +import org.htmlparser.util.ParserException; + +public class HttpLinkParser { + private Set links = new HashSet(); + private String url; + private Filter filter; + + public HttpLinkParser(String url, Filter filter) { + super(); + this.url = url; + this.filter = filter; + } + + public HttpLinkParser parse(){ + NodeFilter frameFilter = new NodeFilter() { + + private static final long serialVersionUID = 1L; + + public boolean accept(Node node) { + if (node.getText().startsWith("frame") || node.getText().startsWith("FRAME")) { + return true; + } + return false; + } + + }; + + OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); + + Parser parser; + try { + parser = new Parser(url); + NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter); + for (int i = 0; i < nodeList.size(); i++) { + Node tag = nodeList.elementAt(i); + + if(tag instanceof LinkTag){ + LinkTag linkTag = (LinkTag)tag; + String link = linkTag.getLink(); + if(filter.accept(link)){ + links.add(link); + } + }else if(tag instanceof FrameTag){ + FrameTag frameTag = (FrameTag)tag; + String link = frameTag.getAttribute("src"); + if(filter.accept(link)){ + links.add(link); + } + } + } + } catch (ParserException e) { + e.printStackTrace(); + } + + return this; + } + + public Set results() { + return Collections.unmodifiableSet(links); + } + +} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/HttpUrlParser.java b/jcrawler/src/main/java/com/gl/jcrawler/HttpUrlParser.java deleted file mode 100644 index 912573f..0000000 --- a/jcrawler/src/main/java/com/gl/jcrawler/HttpUrlParser.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.gl.jcrawler; - -public class HttpUrlParser { - -} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java b/jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java index a3c64f9..306b316 100644 --- a/jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java +++ b/jcrawler/src/main/java/com/gl/jcrawler/ImageCrawlerBootStrap.java @@ -1,10 +1,22 @@ package com.gl.jcrawler; +import java.io.IOException; + +import org.htmlparser.util.ParserException; + public class ImageCrawlerBootStrap { public static void main(String[] args) { - // TODO Auto-generated method stub - + String initUrl = "http://jshop.ofmall.org:81/jshop"; + Jcrawler j = new Jcrawler(initUrl); + + try { + j.crawl(); + } catch (ParserException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } } } diff --git a/jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java b/jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java new file mode 100644 index 0000000..01c745b --- /dev/null +++ b/jcrawler/src/main/java/com/gl/jcrawler/ImageDownloader.java @@ -0,0 +1,41 @@ +package com.gl.jcrawler; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.net.URL; +import java.util.List; + +public class ImageDownloader { + private List listImgSrc; + + + public ImageDownloader(List listImgSrc) { + super(); + this.listImgSrc = listImgSrc; + } + + + public void download(){ + for (String url : listImgSrc) { + try { + String imageName = url.substring(url.lastIndexOf("/") + 1, + url.length()); + + URL uri = new URL(url); + InputStream in = uri.openStream(); + FileOutputStream fo = new FileOutputStream(new File(imageName)); + byte[] buf = new byte[1024]; + int length = 0; + while ((length = in.read(buf, 0, buf.length)) != -1) { + fo.write(buf, 0, length); + } + in.close(); + fo.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + } +} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java b/jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java new file mode 100644 index 0000000..7fde4ed --- /dev/null +++ b/jcrawler/src/main/java/com/gl/jcrawler/ImageUrlParser.java @@ -0,0 +1,33 @@ +package com.gl.jcrawler; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ImageUrlParser { + private static final String IMGURL_REG = "]*?>"; + private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)"; + + public List getImageSrc(String html) { + List listImageUrl = getImageUrl(html); + List listImgSrc = new ArrayList(); + for (String image : listImageUrl) { + Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image); + while (matcher.find()) { + listImgSrc.add(matcher.group().substring(0, + matcher.group().length() - 1)); + } + } + return listImgSrc; + } + + private List getImageUrl(String html) { + Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html); + List listImgUrl = new ArrayList(); + while (matcher.find()) { + listImgUrl.add(matcher.group()); + } + return listImgUrl; + } +} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/Jcrawler.java b/jcrawler/src/main/java/com/gl/jcrawler/Jcrawler.java new file mode 100644 index 0000000..d484ef1 --- /dev/null +++ b/jcrawler/src/main/java/com/gl/jcrawler/Jcrawler.java @@ -0,0 +1,51 @@ +package com.gl.jcrawler; + +import java.io.IOException; +import java.util.List; +import java.util.Set; + +import org.htmlparser.util.ParserException; + +public class Jcrawler { + private String initUrl; + private LinkQueue linkQueue = new LinkQueue(); + + public Jcrawler(String initUrl) { + super(); + this.initUrl = initUrl; + }; + + public void crawl() throws IOException, ParserException { + Filter filter = new Filter() { + public boolean accept(String url) { + if (url.indexOf("http://jshop.ofmall.org:81") != -1 + || url.indexOf("http://jshop.ofmall.org:81/jshop") != -1) { + return true; + } else { + return false; + } + } + }; + + linkQueue.addUnvisitedUrls(initUrl); + + while (!linkQueue.isUnvisitedUrlsEmpty()) { + // 队头URL出队列 + String visitUrl = (String) linkQueue.popUnvisitedUrls(); + if (visitUrl == null){ + continue; + } + + String html = new HtmlContentParser(visitUrl).getHtml(); + List imageUrls = new ImageUrlParser().getImageSrc(html); + new ImageDownloader(imageUrls).download(); + + + Set links = new HttpLinkParser(visitUrl, filter).parse().results(); + for (String link : links) { + linkQueue.addUnvisitedUrls(link); + } + } + } + +} diff --git a/jcrawler/src/main/java/com/gl/jcrawler/LinkQueue.java b/jcrawler/src/main/java/com/gl/jcrawler/LinkQueue.java index 598f538..6a10799 100644 --- a/jcrawler/src/main/java/com/gl/jcrawler/LinkQueue.java +++ b/jcrawler/src/main/java/com/gl/jcrawler/LinkQueue.java @@ -21,7 +21,7 @@ public String popUnvisitedUrls(){ return null; } - public void addVisitedUrls(String url){ + public void addUnvisitedUrls(String url){ if(url == null){ return; }