add direct download
parent 86cfefb58c
commit 8f774afc84
@@ -68,4 +68,13 @@ public class ResultItems {
         this.skip = skip;
         return this;
     }
+
+    @Override
+    public String toString() {
+        return "ResultItems{" +
+                "fields=" + fields +
+                ", request=" + request +
+                ", skip=" + skip +
+                '}';
+    }
 }
@@ -43,6 +43,8 @@ public class Site {

     private HttpHost httpProxy;

+    private boolean useGzip = true;
+
     public static interface HeaderConst {

         public static final String REFERER = "Referer";
@@ -199,7 +201,10 @@ public class Site {

     /**
      * Add a url to start url.<br>
+     * Because urls are more a Spider's property than Site, move it to {@link Spider#addUrl(String...)}}
      *
+     * @deprecated
+     * @see Spider#addUrl(String...)
      * @param startUrl
      * @return this
      */
@@ -209,7 +214,10 @@ public class Site {

     /**
      * Add a url to start url.<br>
+     * Because urls are more a Spider's property than Site, move it to {@link Spider#addRequest(Request...)}}
      *
+     * @deprecated
+     * @see Spider#addRequest(Request...)
      * @param startUrl
      * @return this
      */
@@ -312,6 +320,22 @@ public class Site {
         return this;
     }

+    public boolean isUseGzip() {
+        return useGzip;
+    }
+
+    /**
+     * Whether use gzip. <br>
+     * Default is true, you can set it to false to disable gzip.
+     *
+     * @param useGzip
+     * @return
+     */
+    public Site setUseGzip(boolean useGzip) {
+        this.useGzip = useGzip;
+        return this;
+    }
+
     public Task toTask() {
         return new Task() {
             @Override
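The isUseGzip/setUseGzip pair above turns the gzip request header on and off per site. A minimal usage sketch, not from this commit (Site.me(), setCharset and setRetryTimes already exist in Site):

    // Disable gzip for a server that mishandles Content-Encoding;
    // setUseGzip(true) stays the default.
    Site site = Site.me()
            .setCharset("utf-8")
            .setRetryTimes(3)
            .setUseGzip(false);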
@@ -1,9 +1,11 @@
 package us.codecraft.webmagic;

+import com.google.common.collect.Lists;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.log4j.Logger;
 import us.codecraft.webmagic.downloader.Downloader;
 import us.codecraft.webmagic.downloader.HttpClientDownloader;
+import us.codecraft.webmagic.pipeline.CollectorPipeline;
 import us.codecraft.webmagic.pipeline.ConsolePipeline;
 import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
+import java.util.UUID;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.locks.Condition;
@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {

     protected final static int STAT_STOPPED = 2;

+    protected boolean spawnUrl = true;
+
+    protected boolean destroyWhenExit = true;
+
     private ReentrantLock newUrlLock = new ReentrantLock();

     private Condition newUrlCondition = newUrlLock.newCondition();
@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
             pipelines.add(new ConsolePipeline());
         }
         downloader.setThread(threadNum);
-        executorService = ThreadUtils.newFixedThreadPool(threadNum);
+        if (executorService == null || executorService.isShutdown()) {
+            executorService = ThreadUtils.newFixedThreadPool(threadNum);
+        }
         if (startRequests != null) {
             for (Request request : startRequests) {
                 scheduler.push(request, this);
@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
                 });
             }
         }
-        executorService.shutdown();
         stat.set(STAT_STOPPED);
         // release some resources
-        destroy();
+        if (destroyWhenExit) {
+            close();
+        }
     }

     private void checkRunningStat() {
@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
         }
     }

-    protected void destroy() {
+    public void close() {
         destroyEach(downloader);
         destroyEach(pageProcessor);
         for (Pipeline pipeline : pipelines) {
             destroyEach(pipeline);
         }
+        executorService.shutdown();
     }

     private void destroyEach(Object object) {
@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
     }

     protected void extractAndAddRequests(Page page) {
-        if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
+        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
             for (Request request : page.getTargetRequests()) {
                 addRequest(request);
             }
@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
     }

     private void addRequest(Request request) {
+        if (site.getDomain() == null && request != null && request.getUrl() != null) {
+            site.setDomain(UrlUtils.getDomain(request.getUrl()));
+        }
         scheduler.push(request, this);
-
     }

     protected void checkIfRunning() {
@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
     }

     /**
-     * Add urls to crawl.<br/>
+     * Add urls to crawl. <br/>
      *
      * @param urls
      * @return
@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
         return this;
     }

+    /**
+     * Download urls synchronizing.
+     *
+     * @param urls
+     * @return
+     */
+    public List<ResultItems> getAll(Collection<String> urls) {
+        destroyWhenExit = false;
+        spawnUrl = false;
+        startRequests = UrlUtils.convertToRequests(urls);
+        CollectorPipeline collectorPipeline = new CollectorPipeline();
+        pipelines.add(collectorPipeline);
+        run();
+        spawnUrl = true;
+        destroyWhenExit = true;
+        return collectorPipeline.getCollector();
+    }
+
+    public ResultItems get(String url) {
+        List<String> urls = Lists.newArrayList(url);
+        List<ResultItems> resultItemses = getAll(urls);
+        if (resultItemses != null && resultItemses.size() > 0) {
+            return resultItemses.get(0);
+        } else {
+            return null;
+        }
+    }
+
     /**
      * Add urls with information to crawl.<br/>
      *
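getAll() and get() above are the synchronous entry points behind the commit title: they push only the given urls (spawnUrl is switched off), collect every ResultItems through a CollectorPipeline, and hand the results back once run() returns. A minimal sketch, assuming the GithubRepoPageProcesser shown later in this diff; the urls are illustrative:

    import java.util.Arrays;
    import java.util.List;

    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Spider;
    // import of GithubRepoPageProcesser omitted; its package is not shown in this diff

    public class DirectDownloadExample {
        public static void main(String[] args) {
            Spider spider = Spider.create(new GithubRepoPageProcesser()).thread(2);
            // batch download: one ResultItems per seed url
            List<ResultItems> batch = spider.getAll(Arrays.asList(
                    "https://github.com/code4craft/webmagic",
                    "https://github.com/code4craft/jsoup-learning"));
            for (ResultItems item : batch) {
                System.out.println(item.getAll());
            }
            // single-url convenience wrapper around getAll(); may return null
            ResultItems single = spider.get("https://github.com/code4craft/webmagic");
            System.out.println(single == null ? null : single.getAll());
            // destroyWhenExit is false during getAll()/get(), so close explicitly
            spider.close();
        }
    }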
@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
         return this;
     }

+    public boolean isSpawnUrl() {
+        return spawnUrl;
+    }
+
+    /**
+     * Whether add urls extracted to download.<br>
+     * Add urls to download when it is true, and just download seed urls when it is false. <br>
+     * DO NOT set it unless you know what it means!
+     *
+     * @param spawnUrl
+     * @return
+     * @since 0.4.0
+     */
+    public Spider setSpawnUrl(boolean spawnUrl) {
+        this.spawnUrl = spawnUrl;
+        return this;
+    }
+
     @Override
     public String getUUID() {
         if (uuid != null) {
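A short sketch of the new spawnUrl switch, not from this commit: with setSpawnUrl(false) the spider downloads only the seed urls and ignores the links the PageProcessor extracts, which is what getAll() does internally.

    Spider.create(new GithubRepoPageProcesser())
            .addUrl("https://github.com/code4craft")
            .setSpawnUrl(false)   // seed urls only, no link following
            .thread(2)
            .run();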
@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
         if (site != null) {
             return site.getDomain();
         }
-        return null;
+        uuid = UUID.randomUUID().toString();
+        return uuid;
     }

     @Override
@@ -1,8 +1,9 @@
 package us.codecraft.webmagic.downloader;

-import org.apache.http.*;
+import org.apache.http.HttpException;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpRequestInterceptor;
 import org.apache.http.client.CookieStore;
-import org.apache.http.client.entity.GzipDecompressingEntity;
 import org.apache.http.config.Registry;
 import org.apache.http.config.RegistryBuilder;
 import org.apache.http.conn.socket.ConnectionSocketFactory;
@@ -19,7 +20,7 @@ import java.util.Map;

 /**
  * @author code4crafter@gmail.com <br>
- * @since 0.3.3
+ * @since 0.4.0
  */
 public class HttpClientGenerator {

@@ -46,42 +47,48 @@ public class HttpClientGenerator {
         } else {
             httpClientBuilder.setUserAgent("");
         }
-        httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
-
-            public void process(
-                    final HttpRequest request,
-                    final HttpContext context) throws HttpException, IOException {
-                if (!request.containsHeader("Accept-Encoding")) {
-                    request.addHeader("Accept-Encoding", "gzip");
-                }
-
-            }
-        }).addInterceptorFirst(new HttpResponseInterceptor() {
-
-            public void process(
-                    final HttpResponse response,
-                    final HttpContext context) throws HttpException, IOException {
-                HttpEntity entity = response.getEntity();
-                if (entity != null) {
-                    Header ceheader = entity.getContentEncoding();
-                    if (ceheader != null) {
-                        HeaderElement[] codecs = ceheader.getElements();
-                        for (int i = 0; i < codecs.length; i++) {
-                            if (codecs[i].getName().equalsIgnoreCase("gzip")) {
-                                response.setEntity(
-                                        new GzipDecompressingEntity(response.getEntity()));
-                                return;
-                            }
-                        }
-                    }
-                }
-            }
-
-        });
-        if (site!=null){
-            httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
-        }
-        generateCookie(httpClientBuilder,site);
+        if (site == null || site.isUseGzip()) {
+            httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
+
+                public void process(
+                        final HttpRequest request,
+                        final HttpContext context) throws HttpException, IOException {
+                    if (!request.containsHeader("Accept-Encoding")) {
+                        request.addHeader("Accept-Encoding", "gzip");
+                    }
+                }
+            });
+        }
+
+//        httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() {
+//
+//            public void process(
+//                    final HttpResponse response,
+//                    final HttpContext context) throws HttpException, IOException {
+//                if (response.getStatusLine().getStatusCode() != 200) {
+//                    return;
+//                }
+//                HttpEntity entity = response.getEntity();
+//                if (entity != null) {
+//                    Header ceheader = entity.getContentEncoding();
+//                    if (ceheader != null) {
+//                        HeaderElement[] codecs = ceheader.getElements();
+//                        for (int i = 0; i < codecs.length; i++) {
+//                            if (codecs[i].getName().equalsIgnoreCase("gzip")) {
+//                                response.setEntity(
+//                                        new GzipDecompressingEntity(response.getEntity()));
+//                                return;
+//                            }
+//                        }
+//                    }
+//                }
+//            }
+//
+//        });
+        if (site != null) {
+            httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
+        }
+        generateCookie(httpClientBuilder, site);
         return httpClientBuilder.build();
     }

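The block above only adds the Accept-Encoding header when the site allows gzip; response decompression is left to HttpClient 4.3 itself (the old response interceptor is now commented out). A standalone sketch of the same request interceptor against plain HttpClient 4.x, outside webmagic; the class and method names here are illustrative:

    import java.io.IOException;

    import org.apache.http.HttpException;
    import org.apache.http.HttpRequest;
    import org.apache.http.HttpRequestInterceptor;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.protocol.HttpContext;

    public class GzipClientSketch {

        public static CloseableHttpClient newClient(boolean useGzip) {
            HttpClientBuilder builder = HttpClients.custom();
            if (useGzip) {
                // advertise gzip on every request that does not already set it
                builder.addInterceptorFirst(new HttpRequestInterceptor() {
                    public void process(HttpRequest request, HttpContext context)
                            throws HttpException, IOException {
                        if (!request.containsHeader("Accept-Encoding")) {
                            request.addHeader("Accept-Encoding", "gzip");
                        }
                    }
                });
            }
            return builder.build();
        }
    }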
@@ -0,0 +1,25 @@
+package us.codecraft.webmagic.pipeline;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.4.0
+ */
+public class CollectorPipeline implements Pipeline{
+
+    private List<ResultItems> collector = new ArrayList<ResultItems>();
+
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        collector.add(resultItems);
+    }
+
+    public List<ResultItems> getCollector() {
+        return collector;
+    }
+}
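CollectorPipeline is the in-memory buffer getAll() relies on: every processed page's ResultItems is appended, and the whole list is read back after the crawl finishes. In isolation (a sketch; resultItems and task stand for whatever the spider passes in):

    CollectorPipeline collectorPipeline = new CollectorPipeline();
    collectorPipeline.process(resultItems, task);            // called once per page
    List<ResultItems> collected = collectorPipeline.getCollector();
    System.out.println(collected.size());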
@@ -0,0 +1,48 @@
+package us.codecraft.webmagic.processor.example;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com <br>
+ * @since 0.4.0
+ */
+public class BaiduBaikePageProcesser implements PageProcessor {
+
+    private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888))
+            .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
+
+    @Override
+    public void process(Page page) {
+        page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
+        page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
+        List<String> list = new ArrayList<String>();
+        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
+        list.add(String.format(urlTemplate,"水力发电"));
+        list.add(String.format(urlTemplate,"风力发电"));
+        list.add(String.format(urlTemplate,"太阳能"));
+        list.add(String.format(urlTemplate,"地热发电"));
+        list.add(String.format(urlTemplate,"众数"));
+        list.add(String.format(urlTemplate,"地热发电"));
+        List<ResultItems> resultItemses = spider.getAll(list);
+        for (ResultItems resultItemse : resultItemses) {
+            System.out.println(resultItemse.getAll());
+        }
+        spider.close();
+    }
+}
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
  */
 public class GithubRepoPageProcesser implements PageProcessor {

-    private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100);
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

     @Override
     public void process(Page page) {
@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
     }

     public static void main(String[] args) {
-        Spider.create(new GithubRepoPageProcesser()).thread(5).run();
+        Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run();
     }
 }
@@ -12,7 +12,7 @@ import java.util.List;
  */
 public class OschinaBlogPageProcesser implements PageProcessor {

-    private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
+    private Site site = Site.me().setDomain("my.oschina.net");

     @Override
     public void process(Page page) {
@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
     }

     public static void main(String[] args) {
-        Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
+        Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
     }
 }
@@ -7,6 +7,7 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -88,7 +89,7 @@ public class UrlUtils {
         return stringBuilder.toString();
     }

-    public static List<Request> convertToRequests(List<String> urls) {
+    public static List<Request> convertToRequests(Collection<String> urls) {
         List<Request> requestList = new ArrayList<Request>(urls.size());
         for (String url : urls) {
             requestList.add(new Request(url));
@@ -96,7 +97,7 @@ public class UrlUtils {
         return requestList;
     }

-    public static List<String> convertToUrls(List<Request> requests) {
+    public static List<String> convertToUrls(Collection<Request> requests) {
         List<String> urlList = new ArrayList<String>(requests.size());
         for (Request request : requests) {
             urlList.add(request.getUrl());
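Widening both signatures from List to Collection is what lets Spider#getAll(Collection<String>) forward its argument directly. A small sketch (urls are illustrative):

    Set<String> urls = new LinkedHashSet<String>();
    urls.add("http://my.oschina.net/flashsword/blog");
    urls.add("http://my.oschina.net/flashsword/blog");      // duplicate, dropped by the set
    List<Request> requests = UrlUtils.convertToRequests(urls);
    List<String> back = UrlUtils.convertToUrls(requests);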
@@ -11,7 +11,7 @@ import java.util.ArrayList;
 import java.util.List;

 /**
- * @since 0.3.3
+ * @since 0.4.0
  * NO implement yet!!!!!!!!!!!!
  * @author code4crafter@gmail.com
  */