diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index bebbb83..e13b809 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,6 +1351,7 @@ public class XpathSelectorTest { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); + System.out.println(html1.regex("(.*?)").links().toStrings()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index bb77931..baa375d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -1,7 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -15,7 +16,7 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); + page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().toStrings()); page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); @@ -26,9 +27,13 @@ public class SinaBlogProcesser implements PageProcessor { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). + site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcesser()).run(); + } } diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 5680d12..33bcf9c 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.processor; +import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; @@ -16,6 +17,7 @@ import java.io.IOException; */ public class DiaoyuwengProcessorTest { + @Ignore @Test public void test() throws IOException { DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();