diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index b2dd3db..40f17f0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -6,8 +6,6 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; /** *
@@ -27,7 +25,7 @@ public class Page { private Request request; - private Mapfields = new ConcurrentHashMap (); + private ResultItems resultItems = new ResultItems(); private Selectable html; @@ -35,44 +33,16 @@ public class Page { private List targetRequests = new ArrayList (); - private boolean skip; - - private Object extra; - - /** - * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 - * @return 是否忽略 true 忽略 - */ - public boolean isSkip() { - return skip; - } - - /** - * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 - * @param skip 是否忽略 true 忽略 - */ - public void setSkip(boolean skip) { - this.skip = skip; - } - public Page() { } - /** - * 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用 - * @return fields 抽取的结果 - */ - public Map getFields() { - return fields; - } - /** * 保存抽取的结果 * @param key 结果的key * @param field 结果的value */ - public void putField(String key, Selectable field) { - fields.put(key, field); + public void putField(String key, Object field) { + resultItems.put(key, field); } /** @@ -157,23 +127,10 @@ public class Page { public void setRequest(Request request) { this.request = request; + this.resultItems.setRequest(request); } - /** - * 获取附加对象 - * @param 对象类型 - * @return 对象内容 - */ - public T getExtra() { - return (T)extra; - } - - /** - * 设置附加对象 - * @param extra 对象内容 - * @param 对象类型 - */ - public void setExtra(T extra) { - this.extra = extra; + public ResultItems getResultItems() { + return resultItems; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java new file mode 100644 index 0000000..0c1d94c --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic; + +import java.util.HashMap; +import java.util.Map; + +/** + * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
+ * @author yihua.huang@dianping.com
+ * @date: 13-7-25
+ * Time: 下午12:20
+ */ +public class ResultItems { + + private Mapfields = new HashMap (); + + private Request request; + + private boolean skip; + + public T get(String key) { + Object o = fields.get(key); + if (o == null) { + return null; + } + return (T) fields.get(key); + } + + public Map getAll() { + return fields; + } + + public ResultItems put(String key, T value) { + fields.put(key, value); + return this; + } + + public Request getRequest() { + return request; + } + + public ResultItems setRequest(Request request) { + this.request = request; + return this; + } + + /** + * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @return 是否忽略 true 忽略 + */ + public boolean isSkip() { + return skip; + } + + + /** + * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 + * @param skip + * @return this + */ + public ResultItems setSkip(boolean skip) { + this.skip = skip; + return this; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 57e29b1..a51ed96 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -196,7 +196,7 @@ public class Spider implements Runnable, Task { pageProcessor.process(page); addRequest(page); for (Pipeline pipeline : pipelines) { - pipeline.process(page, this); + pipeline.process(page.getResultItems(), this); } sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index dff2ded..72c3bf3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,8 +1,7 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Selectable; import java.util.Map; @@ -15,13 +14,10 @@ import java.util.Map; public class ConsolePipeline implements Pipeline{ @Override - public void process(Page page,Task task) { - System.out.println("get page: "+page.getUrl()); - for (Map.Entry entry : page.getFields().entrySet()) { - System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); - } - if (page.getExtra()!=null){ - System.out.println(page.getExtra()); + public void process(ResultItems resultItems,Task task) { + System.out.println("get page: "+resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + System.out.println(entry.getKey()+":\t"+entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index e48e2bb..0948bfe 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -2,13 +2,14 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.util.Map; /** * 持久化到文件的接口。 @@ -38,16 +39,18 @@ public class FilePipeline implements Pipeline { } @Override - public void process(Page page, Task task) { + public void process(ResultItems resultItems, Task task) { String path = this.path + "/" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()))); - printWriter.println("url:\t" + page.getUrl()); - printWriter.println("html:\t" + page.getHtml()); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()))); + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + printWriter.println(entry.getKey()+":\t"+entry.getValue()); + } printWriter.close(); } catch (IOException e) { logger.warn("write file error",e); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 408392d..595a8e8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.pipeline; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; /** @@ -11,5 +11,5 @@ import us.codecraft.webmagic.Task; */ public interface Pipeline { - public void process(Page page,Task task); + public void process(ResultItems resultItems,Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 47d3748..ff96460 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -30,12 +30,13 @@ public class SimplePageProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().links().regex(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).all(); //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); //xpath方式抽取 page.putField("title", page.getHtml().xpath("//title")); //sc表示使用Readability技术抽取正文 + page.putField("html", page.getHtml().toString()); page.putField("content", page.getHtml().smartContent()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 9e8d194..d06a531 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -82,14 +82,14 @@ public class PlainText implements Selectable { } @Override - public List toStrings() { + public List all() { return strings; } @Override public String toString() { - if (CollectionUtils.isNotEmpty(toStrings())) { - return toStrings().get(0); + if (CollectionUtils.isNotEmpty(all())) { + return all().get(0); } else { return null; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 1b0ba10..42f3d10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -69,5 +69,5 @@ public interface Selectable { * * @return multi string result */ - public List toStrings(); + public List all(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 3ef0a92..30d8a81 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,7 +1351,7 @@ public class XpathSelectorTest { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); - Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings()); + Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); } } diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 8741ef4..9a045ef 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -4,7 +4,7 @@ import freemarker.template.Configuration; import freemarker.template.Template; import freemarker.template.TemplateException; import org.apache.commons.codec.digest.DigestUtils; -import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import java.io.File; @@ -39,8 +39,8 @@ public class FreemarkerPipeline implements Pipeline { @Override - public void process(Page page, Task task) { - if (page.isSkip()) { + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()) { return; } String path = this.path + "" + task.getUUID() + "/"; @@ -49,8 +49,8 @@ public class FreemarkerPipeline implements Pipeline { file.mkdirs(); } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString()) + ".html")); - template.process(page.getFields(), printWriter); + PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); + template.process(resultItems.getAll(), printWriter); printWriter.close(); } catch (TemplateException e) { } catch (IOException e) { diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e87ee33..481981d 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -28,7 +28,9 @@ public class RedisScheduler implements Scheduler{ @Override public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); + //使用SortedSet进行url去重 if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ + //使用List保存队列 jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); } diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java index 0f556d2..6db21a8 100644 --- a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.scheduler; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -20,6 +21,7 @@ public class RedisSchedulerTest { redisScheduler = new RedisScheduler("localhost"); } + @Ignore("environment depended") @Test public void test() { Task task = new Task() { @@ -35,7 +37,6 @@ public class RedisSchedulerTest { }; redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); Request poll = redisScheduler.poll(task); - System.out.println(poll.getUrl()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index e5aafe7..a1189e4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -20,13 +20,13 @@ public class DiandianBlogProcessor implements PageProcessor { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 - //toString()表示取单条结果,toStrings()表示取多条 - List requests = page.getHtml().links().regex("(.*/post/.*)").toStrings(); + //toString()表示取单条结果,all()表示取多条 + List requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 - page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|")); + page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 056da0a..b7e3ee0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -18,7 +18,7 @@ public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").toStrings(); + List requests = page.getHtml().links().regex("http://info-search-web121361\\.alpha\\.dp:8080/search/.*").all(); page.addTargetRequests(requests); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 695d2e2..115f183 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -18,9 +18,9 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); + List requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); page.addTargetRequests(requests); - requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); + requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 9d5140a..4ffe127 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -15,7 +15,7 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex(" (.*) ")); page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index f7c5f7f..0e3f9a3 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -20,7 +20,7 @@ public class GlobalProcessor implements PageProcessor { @Override public void process(Page page) { - final Listrequests = page.getHtml().links().toStrings(); + final List requests = page.getHtml().links().all(); page.addTargetRequests(requests); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 26c60cc..89b74d6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,7 +15,7 @@ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 39f5723..a4e6e43 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -15,12 +15,12 @@ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings()); + page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index a7e9c9b..2337da5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -14,7 +14,7 @@ import java.util.List; public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 9293b41..f2dbe8e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -15,7 +15,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings(); + List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); page.putField("content", page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index f88ce06..522eb2c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -15,7 +15,7 @@ public class OschinaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index bf4dcc2..49418b6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -18,7 +18,7 @@ public class QzoneBlogProcessor implements PageProcessor { //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index baa375d..b4c5bc8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -16,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings()); + page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 278657f..ecc55b4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -15,7 +15,7 @@ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); page.putField("body",page.getHtml().smartContent());