diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 13733c2..5606d12 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,9 +1,5 @@ package us.codecraft.webmagic; -import org.apache.http.HttpHost; -import org.apache.http.auth.UsernamePasswordCredentials; -import us.codecraft.webmagic.proxy.ProxyProvider; - import java.util.*; /** @@ -41,12 +37,6 @@ public class Site { private Map headers = new HashMap(); - private HttpHost httpProxy; - - private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置 - - private ProxyProvider httpProxyPool; - private boolean useGzip = true; /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c8d974f..5e785af 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -479,7 +479,9 @@ public class Spider implements Runnable, Task { public List getAll(Collection urls) { destroyWhenExit = false; spawnUrl = false; - startRequests.clear(); + if (startRequests!=null){ + startRequests.clear(); + } for (Request request : UrlUtils.convertToRequests(urls)) { addRequest(request); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1da64e7..9e17f60 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -95,12 +95,12 @@ public class HttpClientGenerator { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); - if (site != null && site.getUserAgent() != null) { + if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } - if (site == null || site.isUseGzip()) { + if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( @@ -117,16 +117,12 @@ public class HttpClientGenerator { SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); - if (site != null) { - socketConfigBuilder.setSoTimeout(site.getTimeOut()); - } + socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); - if (site != null) { - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); - generateCookie(httpClientBuilder, site); - } + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); + generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index a0572a9..842429b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.processor; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.utils.UrlUtils; import java.util.List; @@ -18,9 +17,8 @@ public class SimplePageProcessor implements PageProcessor { private Site site; - public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().addStartUrl(startUrl). - setDomain(UrlUtils.getDomain(startUrl)); + public SimplePageProcessor(String urlPattern) { + this.site = Site.me(); //compile "*" expression to regex this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index ba29387..4f4a280 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -19,12 +19,12 @@ public class SpiderTest { @Ignore("long time") @Test public void testStartAndStop() throws InterruptedException { - Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() { + Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() { @Override public void process(ResultItems resultItems, Task task) { System.out.println(1); } - }).thread(1); + }).thread(1).addUrl("http://www.oschina.net/"); spider.start(); Thread.sleep(10000); spider.stop(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java deleted file mode 100644 index 3c7e6ff..0000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java +++ /dev/null @@ -1,124 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.*; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.pipeline.Pipeline; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.selector.Html; -import us.codecraft.webmagic.selector.PlainText; -import us.codecraft.webmagic.utils.FilePersistentBase; -import us.codecraft.webmagic.utils.UrlUtils; - -import java.io.*; - -/** - * Download file and saved to file for cache.
- * - * @author code4crafter@gmail.com - * @since 0.2.1 - */ -@Experimental -public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor { - - private Downloader downloaderWhenFileMiss; - - private final PageProcessor pageProcessor; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - public FileCache(String startUrl, String urlPattern) { - this(startUrl, urlPattern, "/data/webmagic/temp/"); - } - - public FileCache(String startUrl, String urlPattern, String path) { - this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern); - setPath(path); - downloaderWhenFileMiss = new HttpClientDownloader(); - } - - public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) { - this.downloaderWhenFileMiss = downloaderWhenFileMiss; - return this; - } - - @Override - public Page download(Request request, Task task) { - String path = this.path + "/" + task.getUUID() + "/"; - Page page = null; - try { - final File file = getFile(path + DigestUtils.md5Hex(request.getUrl())); - BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); - String line = bufferedReader.readLine(); - if (line.equals("url:\t" + request.getUrl())) { - final String html = getHtml(bufferedReader); - page = new Page(); - page.setRequest(request); - page.setUrl(PlainText.create(request.getUrl())); - page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl()))); - } - } catch (IOException e) { - if (e instanceof FileNotFoundException) { - logger.info("File not exist for url " + request.getUrl()); - } else { - logger.warn("File read error for url " + request.getUrl(), e); - } - } - if (page == null) { - page = downloadWhenMiss(request, task); - } - return page; - } - - @Override - public void setThread(int thread) { - - } - - private String getHtml(BufferedReader bufferedReader) throws IOException { - String line; - StringBuilder htmlBuilder = new StringBuilder(); - line = bufferedReader.readLine(); - line = StringUtils.removeStart(line, "html:\t"); - htmlBuilder.append(line); - while ((line = bufferedReader.readLine()) != null) { - htmlBuilder.append(line); - } - return htmlBuilder.toString(); - } - - private Page downloadWhenMiss(Request request, Task task) { - Page page = null; - if (downloaderWhenFileMiss != null) { - page = downloaderWhenFileMiss.download(request, task); - } - return page; - } - - @Override - public void process(ResultItems resultItems, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; - try { - PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); - printWriter.println("url:\t" + resultItems.getRequest().getUrl()); - printWriter.println("html:\t" + resultItems.get("html")); - printWriter.close(); - } catch (IOException e) { - logger.warn("write file error", e); - } - } - - @Override - public void process(Page page) { - pageProcessor.process(page); - } - - @Override - public Site getSite() { - return pageProcessor.getSite(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java deleted file mode 100644 index f73b344..0000000 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.Spider; - -/** - * @author code4crafter@gmail.com
- */ -public class FileCacheTest { - - @Ignore("takes long") - @Test - public void test() { - FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*"); - Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run(); - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java index bf9e381..1c8742c 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java @@ -19,7 +19,7 @@ public class GithubRepoProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); + return Site.me(); } @Test diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index 25baa1f..8bd7d58 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -35,7 +35,7 @@ public class DiandianBlogProcessor implements PageProcessor { public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { - site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/"). + site = Site.me().setDomain("progressdaily.diandian.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 3ceba0a..61458d0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -34,13 +34,13 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public Site getSite() { if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + site= Site.me().setDomain("www.diaoyuweng.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } public static void main(String[] args) { - Spider.create(new DiaoyuwengProcessor()).run(); + Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 3d27be8..8091b65 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -25,10 +25,10 @@ public class F58PageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. } public static void main(String[] args) { - Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); + Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 000cb99..1cc90b0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -21,11 +21,11 @@ public class HuxiuProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); + return Site.me().setDomain("www.huxiu.com"); } public static void main(String[] args) { - Spider.create(new HuxiuProcessor()).run(); + Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 3ef3957..280f8f1 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -29,7 +29,7 @@ public class InfoQMiniBookProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). + site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; @@ -38,6 +38,7 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) .thread(5) + .addUrl("http://www.infoq.com/cn/minibooks") .run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index 26b85e8..6dce807 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -22,12 +22,12 @@ public class IteyeBlogProcessor implements PageProcessor { @Override public Site getSite() { if (site == null) { - site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); + site = Site.me().setDomain("yanghaoli.iteye.com"); } return site; } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 0ab6c64..b373f52 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -22,11 +22,11 @@ public class KaichibaProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). + return Site.me().setDomain("kaichiba.com").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { - Spider.create(new KaichibaProcessor()).run(); + Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index bfa347d..cb4c498 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -28,11 +28,11 @@ public class MeicanProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). + return Site.me().setDomain("meican.com").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { - Spider.create(new MeicanProcessor()).run(); + Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 16dcb0c..ce0f817 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -22,6 +23,10 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); + return Site.me().setDomain("bbs.nju.edu.cn"); + } + + public static void main(String[] args) { + Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java deleted file mode 100644 index e6db04e..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ /dev/null @@ -1,41 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.monitor.SpiderMonitor; -import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; -import us.codecraft.webmagic.scheduler.QueueScheduler; - -import javax.management.JMException; -import java.util.List; - -/** - * @author code4crafter@gmail.com
- */ -public class OschinaBlogPageProcesser implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); - page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } - - @Override - public Site getSite() { - return site; - - } - - public static void main(String[] args) throws JMException { - Spider spider = Spider.create(new OschinaBlogPageProcesser()).setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(2000))); - SpiderMonitor.instance().register(spider); - spider.run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java deleted file mode 100644 index b75cc83..0000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ /dev/null @@ -1,27 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.processor.PageProcessor; - -import java.util.List; - -/** - * @author code4crafter@gmail.com
- */ -public class OschinaPageProcesser implements PageProcessor { - - @Override - public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); - page.addTargetRequests(strings); - page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); - } - - @Override - public Site getSite() { - return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index d9cee2b..037b333 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -24,7 +24,7 @@ public class QzoneBlogProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). + return Site.me().setDomain("www.diandian.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index d14b442..6cc8f99 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -21,6 +21,6 @@ public class TianyaPageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. } }