fix samples
parent
c51ac6017c
commit
c13110c4cb
|
@ -1,9 +1,5 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
|
||||||
import us.codecraft.webmagic.proxy.ProxyProvider;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -41,12 +37,6 @@ public class Site {
|
||||||
|
|
||||||
private Map<String, String> headers = new HashMap<String, String>();
|
private Map<String, String> headers = new HashMap<String, String>();
|
||||||
|
|
||||||
private HttpHost httpProxy;
|
|
||||||
|
|
||||||
private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置
|
|
||||||
|
|
||||||
private ProxyProvider httpProxyPool;
|
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -479,7 +479,9 @@ public class Spider implements Runnable, Task {
|
||||||
public <T> List<T> getAll(Collection<String> urls) {
|
public <T> List<T> getAll(Collection<String> urls) {
|
||||||
destroyWhenExit = false;
|
destroyWhenExit = false;
|
||||||
spawnUrl = false;
|
spawnUrl = false;
|
||||||
startRequests.clear();
|
if (startRequests!=null){
|
||||||
|
startRequests.clear();
|
||||||
|
}
|
||||||
for (Request request : UrlUtils.convertToRequests(urls)) {
|
for (Request request : UrlUtils.convertToRequests(urls)) {
|
||||||
addRequest(request);
|
addRequest(request);
|
||||||
}
|
}
|
||||||
|
|
|
@ -95,12 +95,12 @@ public class HttpClientGenerator {
|
||||||
HttpClientBuilder httpClientBuilder = HttpClients.custom();
|
HttpClientBuilder httpClientBuilder = HttpClients.custom();
|
||||||
|
|
||||||
httpClientBuilder.setConnectionManager(connectionManager);
|
httpClientBuilder.setConnectionManager(connectionManager);
|
||||||
if (site != null && site.getUserAgent() != null) {
|
if (site.getUserAgent() != null) {
|
||||||
httpClientBuilder.setUserAgent(site.getUserAgent());
|
httpClientBuilder.setUserAgent(site.getUserAgent());
|
||||||
} else {
|
} else {
|
||||||
httpClientBuilder.setUserAgent("");
|
httpClientBuilder.setUserAgent("");
|
||||||
}
|
}
|
||||||
if (site == null || site.isUseGzip()) {
|
if (site.isUseGzip()) {
|
||||||
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
|
||||||
|
|
||||||
public void process(
|
public void process(
|
||||||
|
@ -117,16 +117,12 @@ public class HttpClientGenerator {
|
||||||
|
|
||||||
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
|
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
|
||||||
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
|
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
|
||||||
if (site != null) {
|
socketConfigBuilder.setSoTimeout(site.getTimeOut());
|
||||||
socketConfigBuilder.setSoTimeout(site.getTimeOut());
|
|
||||||
}
|
|
||||||
SocketConfig socketConfig = socketConfigBuilder.build();
|
SocketConfig socketConfig = socketConfigBuilder.build();
|
||||||
httpClientBuilder.setDefaultSocketConfig(socketConfig);
|
httpClientBuilder.setDefaultSocketConfig(socketConfig);
|
||||||
connectionManager.setDefaultSocketConfig(socketConfig);
|
connectionManager.setDefaultSocketConfig(socketConfig);
|
||||||
if (site != null) {
|
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
|
||||||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
|
generateCookie(httpClientBuilder, site);
|
||||||
generateCookie(httpClientBuilder, site);
|
|
||||||
}
|
|
||||||
return httpClientBuilder.build();
|
return httpClientBuilder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
private Site site;
|
private Site site;
|
||||||
|
|
||||||
public SimplePageProcessor(String startUrl, String urlPattern) {
|
public SimplePageProcessor(String urlPattern) {
|
||||||
this.site = Site.me().addStartUrl(startUrl).
|
this.site = Site.me();
|
||||||
setDomain(UrlUtils.getDomain(startUrl));
|
|
||||||
//compile "*" expression to regex
|
//compile "*" expression to regex
|
||||||
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
|
this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
|
||||||
|
|
||||||
|
|
|
@ -19,12 +19,12 @@ public class SpiderTest {
|
||||||
@Ignore("long time")
|
@Ignore("long time")
|
||||||
@Test
|
@Test
|
||||||
public void testStartAndStop() throws InterruptedException {
|
public void testStartAndStop() throws InterruptedException {
|
||||||
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
System.out.println(1);
|
System.out.println(1);
|
||||||
}
|
}
|
||||||
}).thread(1);
|
}).thread(1).addUrl("http://www.oschina.net/");
|
||||||
spider.start();
|
spider.start();
|
||||||
Thread.sleep(10000);
|
Thread.sleep(10000);
|
||||||
spider.stop();
|
spider.stop();
|
||||||
|
|
|
@ -1,124 +0,0 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import us.codecraft.webmagic.*;
|
|
||||||
import us.codecraft.webmagic.utils.Experimental;
|
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
|
||||||
import us.codecraft.webmagic.selector.Html;
|
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
|
||||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Download file and saved to file for cache.<br>
|
|
||||||
*
|
|
||||||
* @author code4crafter@gmail.com
|
|
||||||
* @since 0.2.1
|
|
||||||
*/
|
|
||||||
@Experimental
|
|
||||||
public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
|
|
||||||
|
|
||||||
private Downloader downloaderWhenFileMiss;
|
|
||||||
|
|
||||||
private final PageProcessor pageProcessor;
|
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
public FileCache(String startUrl, String urlPattern) {
|
|
||||||
this(startUrl, urlPattern, "/data/webmagic/temp/");
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileCache(String startUrl, String urlPattern, String path) {
|
|
||||||
this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
|
|
||||||
setPath(path);
|
|
||||||
downloaderWhenFileMiss = new HttpClientDownloader();
|
|
||||||
}
|
|
||||||
|
|
||||||
public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
|
|
||||||
this.downloaderWhenFileMiss = downloaderWhenFileMiss;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Page download(Request request, Task task) {
|
|
||||||
String path = this.path + "/" + task.getUUID() + "/";
|
|
||||||
Page page = null;
|
|
||||||
try {
|
|
||||||
final File file = getFile(path + DigestUtils.md5Hex(request.getUrl()));
|
|
||||||
BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
|
|
||||||
String line = bufferedReader.readLine();
|
|
||||||
if (line.equals("url:\t" + request.getUrl())) {
|
|
||||||
final String html = getHtml(bufferedReader);
|
|
||||||
page = new Page();
|
|
||||||
page.setRequest(request);
|
|
||||||
page.setUrl(PlainText.create(request.getUrl()));
|
|
||||||
page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl())));
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
if (e instanceof FileNotFoundException) {
|
|
||||||
logger.info("File not exist for url " + request.getUrl());
|
|
||||||
} else {
|
|
||||||
logger.warn("File read error for url " + request.getUrl(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (page == null) {
|
|
||||||
page = downloadWhenMiss(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setThread(int thread) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHtml(BufferedReader bufferedReader) throws IOException {
|
|
||||||
String line;
|
|
||||||
StringBuilder htmlBuilder = new StringBuilder();
|
|
||||||
line = bufferedReader.readLine();
|
|
||||||
line = StringUtils.removeStart(line, "html:\t");
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
while ((line = bufferedReader.readLine()) != null) {
|
|
||||||
htmlBuilder.append(line);
|
|
||||||
}
|
|
||||||
return htmlBuilder.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Page downloadWhenMiss(Request request, Task task) {
|
|
||||||
Page page = null;
|
|
||||||
if (downloaderWhenFileMiss != null) {
|
|
||||||
page = downloaderWhenFileMiss.download(request, task);
|
|
||||||
}
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(ResultItems resultItems, Task task) {
|
|
||||||
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
|
||||||
try {
|
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
|
|
||||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
|
||||||
printWriter.println("html:\t" + resultItems.get("html"));
|
|
||||||
printWriter.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
logger.warn("write file error", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
pageProcessor.process(page);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return pageProcessor.getSite();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,18 +0,0 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
|
||||||
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class FileCacheTest {
|
|
||||||
|
|
||||||
@Ignore("takes long")
|
|
||||||
@Test
|
|
||||||
public void test() {
|
|
||||||
FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*");
|
|
||||||
Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
|
return Site.me();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor {
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
//site定义抽取配置,以及开始url等
|
//site定义抽取配置,以及开始url等
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
|
site = Site.me().setDomain("progressdaily.diandian.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
|
|
|
@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site==null){
|
if (site==null){
|
||||||
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
|
site= Site.me().setDomain("www.diaoyuweng.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500);
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new DiaoyuwengProcessor()).run();
|
Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run();
|
Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
|
return Site.me().setDomain("www.huxiu.com");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new HuxiuProcessor()).run();
|
Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
|
site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
|
@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor {
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new InfoQMiniBookProcessor())
|
Spider.create(new InfoQMiniBookProcessor())
|
||||||
.thread(5)
|
.thread(5)
|
||||||
|
.addUrl("http://www.infoq.com/cn/minibooks")
|
||||||
.run();
|
.run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor {
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
if (site == null) {
|
if (site == null) {
|
||||||
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
|
site = Site.me().setDomain("yanghaoli.iteye.com");
|
||||||
}
|
}
|
||||||
return site;
|
return site;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new IteyeBlogProcessor()).thread(5).run();
|
Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
|
return Site.me().setDomain("kaichiba.com").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new KaichibaProcessor()).run();
|
Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8").
|
return Site.me().setDomain("meican.com").setCharset("utf-8").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new MeicanProcessor()).run();
|
Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
package us.codecraft.webmagic.samples;
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
|
return Site.me().setDomain("bbs.nju.edu.cn");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Spider;
|
|
||||||
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
|
||||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
|
||||||
|
|
||||||
import javax.management.JMException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class OschinaBlogPageProcesser implements PageProcessor {
|
|
||||||
|
|
||||||
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
||||||
page.addTargetRequests(links);
|
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
|
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
|
|
||||||
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return site;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String[] args) throws JMException {
|
|
||||||
Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000)));
|
|
||||||
SpiderMonitor.instance().register(spider);
|
|
||||||
spider.run();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,27 +0,0 @@
|
||||||
package us.codecraft.webmagic.samples;
|
|
||||||
|
|
||||||
import us.codecraft.webmagic.Site;
|
|
||||||
import us.codecraft.webmagic.Page;
|
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author code4crafter@gmail.com <br>
|
|
||||||
*/
|
|
||||||
public class OschinaPageProcesser implements PageProcessor {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void process(Page page) {
|
|
||||||
List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
|
|
||||||
page.addTargetRequests(strings);
|
|
||||||
page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
|
|
||||||
page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Site getSite() {
|
|
||||||
return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
|
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
|
return Site.me().setDomain("www.diandian.com").
|
||||||
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Site getSite() {
|
public Site getSite() {
|
||||||
return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue