From b1cba78bd6930bbbc3d44b4825fcc752932ca02c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 07:30:31 +0800 Subject: [PATCH 1/8] xsoup test --- webmagic-saxon/pom.xml | 5 ++ .../webmagic/selector/XpathSelectorTest.java | 77 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index efa8291..1c4e745 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -17,6 +17,11 @@ webmagic-core ${project.version} + + us.codecraft + xsoup + 0.0.1-SNAPSHOT + net.sf.saxon Saxon-HE diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index b623040..6c19c8a 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,8 +1,15 @@ package us.codecraft.webmagic.selector; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.htmlcleaner.XPatherException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1353,6 +1360,7 @@ public class XpathSelectorTest { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); + Selectors.xpath("/abc/").select(""); } @Test @@ -1379,17 +1387,86 @@ public class XpathSelectorTest { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } + System.out.println(System.currentTimeMillis() - time); + + CssSelector cssSelector = new CssSelector("a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + cssSelector.selectList(html); + } + System.out.println("css "+(System.currentTimeMillis()-time)); + } + + @Ignore("take long time") + @Test + public void parserPerformanceTest() throws XPatherException { + System.out.println(html.length()); + + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(html); + Document document = Jsoup.parse(html); + + long time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + Jsoup.parse(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + document.select("a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + XPathEvaluator compile = Xsoup.compile("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + compile.evaluate(document); + } + System.out.println(System.currentTimeMillis()-time); + } } From 55d4a76ab7f6238a60e917371ea54164d569edab Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 08:21:32 +0800 Subject: [PATCH 2/8] newselectors --- webmagic-core/pom.xml | 6 ++++ .../selector/BaseElementSelector.java | 23 +++++++++++++ .../webmagic/selector/CssSelector.java | 26 +++++++-------- .../webmagic/selector/ElementSelector.java | 32 +++++++++++++++++++ .../webmagic/selector/XsoupSelector.java | 32 +++++++++++++++++++ 5 files changed, 104 insertions(+), 15 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index b19820d..ef9f84a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -25,6 +25,12 @@ commons-lang3
+ + us.codecraft + xsoup + 0.0.1-SNAPSHOT + + log4j log4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java new file mode 100644 index 0000000..d14a708 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; + +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public abstract class BaseElementSelector implements Selector,ElementSelector { + + @Override + public String select(String text) { + return select(Jsoup.parse(text)); + } + + @Override + public List selectList(String text) { + return selectList(Jsoup.parse(text)); + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 5031077..9c7032c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,8 +1,6 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -15,7 +13,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class CssSelector implements Selector { +public class CssSelector extends BaseElementSelector { private String selectorText; @@ -30,16 +28,6 @@ public class CssSelector implements Selector { this.attrName = attrName; } - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - Elements elements = doc.select(selectorText); - if (CollectionUtils.isEmpty(elements)) { - return null; - } - return getValue(elements.get(0)); - } - private String getValue(Element element) { if (attrName == null) { return element.outerHtml(); @@ -51,9 +39,17 @@ public class CssSelector implements Selector { } @Override - public List selectList(String text) { + public String select(Element element) { + Elements elements = element.select(selectorText); + if (CollectionUtils.isEmpty(elements)) { + return null; + } + return getValue(elements.get(0)); + } + + @Override + public List selectList(Element doc) { List strings = new ArrayList(); - Document doc = Jsoup.parse(text); Elements elements = doc.select(selectorText); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java new file mode 100644 index 0000000..793b825 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.List; + +/** + * Selector(extractor) for html elements.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public interface ElementSelector { + + /** + * Extract single result in text.
+ * If there are more than one result, only the first will be chosen. + * + * @param element + * @return result + */ + public String select(Element element); + + /** + * Extract all results in text.
+ * + * @param element + * @return results + */ + public List selectList(Element element); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java new file mode 100644 index 0000000..698b29b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; + +import java.util.List; + +/** + * XPath selector based on Xsoup.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.2 + */ +public class XsoupSelector extends BaseElementSelector { + + private XPathEvaluator xPathEvaluator; + + public XsoupSelector(String xpathStr) { + this.xPathEvaluator = Xsoup.compile(xpathStr); + } + + @Override + public String select(Element element) { + return xPathEvaluator.evaluate(element).get(); + } + + @Override + public List selectList(Element element) { + return xPathEvaluator.evaluate(element).list(); + } +} From d7cd9e5747859b41cc5d97fbebfc80bdc88ad78b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 11:56:01 +0800 Subject: [PATCH 3/8] update pom --- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../java/us/codecraft/webmagic/utils/ExtractorUtils.java | 9 +++------ webmagic-samples/pom.xml | 2 +- .../codecraft/webmagic/samples/DiaoyuwengProcessor.java | 7 ++++++- .../us/codecraft/webmagic/samples/F58PageProcesser.java | 9 +++++++-- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 5 +++++ 6 files changed, 23 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index f3d29aa..493c762 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -59,7 +59,7 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); + XsoupSelector xpathSelector = new XsoupSelector(xpath); return selectList(xpathSelector, strings); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 5c6ebbf..1099636 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -1,10 +1,7 @@ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.util.ArrayList; import java.util.List; @@ -27,10 +24,10 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); break; default: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); } return selector; } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 35ddcaa..a349a68 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 115f183..3ceba0a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.PlainText; @@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor { page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } @@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor { } return site; } + + public static void main(String[] args) { + Spider.create(new DiaoyuwengProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 4ffe127..7124a8c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); + List strings = page.getHtml().links().regex(".*/yewu/.*").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); + page.putField("body",page.getHtml().xpath("//dd")); } @Override public Site getSite() { return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. } + + public static void main(String[] args) { + Spider.create(new F58PageProcesser()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 89b74d6..4ac9310 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -26,4 +27,8 @@ public class HuxiuProcessor implements PageProcessor { return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new HuxiuProcessor()).run(); + } } From 85b7cf1563337ae07e448d3de0f5c5939fa676b6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 13:52:41 +0800 Subject: [PATCH 4/8] complete test --- .../main/java/us/codecraft/webmagic/selector/Html.java | 2 +- .../us/codecraft/webmagic/samples/HuxiuProcessor.java | 5 ++--- .../webmagic/samples/InfoQMiniBookProcessor.java | 4 ---- .../codecraft/webmagic/samples/IteyeBlogProcessor.java | 3 +-- .../us/codecraft/webmagic/samples/KaichibaProcessor.java | 5 +++++ .../us/codecraft/webmagic/samples/MeicanProcessor.java | 9 +++++++-- .../webmagic/samples/OschinaBlogPageProcesser.java | 9 ++++----- 7 files changed, 20 insertions(+), 17 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 493c762..a4ea0d3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -53,7 +53,7 @@ public class Html extends PlainText { @Override public Selectable links() { - XpathSelector xpathSelector = Selectors.xpath("//a/@href"); + XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); return selectList(xpathSelector, strings); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 4ac9310..136eeb8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,10 +15,9 @@ import java.util.List; public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); + List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("content",page.getHtml().smartContent()); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index b43c3c5..38de3bc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/temp/webmagic/")) .thread(5) .run(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index c0b3f73..f80f895 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -32,6 +31,6 @@ public class IteyeBlogProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index aff18a6..0ab6c64 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor { return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new KaichibaProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index a4e6e43..bfa347d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor { } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); } @Override @@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor { return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new MeicanProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 8ba7063..e447003 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor { public void process(Page page) { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); + Spider.create(new OschinaBlogPageProcesser()).run(); } } From 2c3574537afd2707251e82d248f260cc2e333356 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 2 Sep 2013 14:14:24 +0800 Subject: [PATCH 5/8] refactor in selectors --- .../us/codecraft/webmagic/selector/Html.java | 42 +++++---- .../webmagic/selector/PlainText.java | 14 +-- .../webmagic/selector/Selectable.java | 14 --- .../webmagic/selector/SelectorFactory.java | 91 ------------------- .../webmagic/selector/Selectors.java | 20 ++-- .../selector/TextContentSelector.java | 68 -------------- .../selector/TextContentSelectorTest.java | 34 ------- 7 files changed, 35 insertions(+), 248 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index a4ea0d3..06987d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic.selector; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + import java.util.ArrayList; import java.util.List; @@ -11,12 +14,23 @@ import java.util.List; */ public class Html extends PlainText { + /** + * Store parsed document for better performance when only one text exist. + */ + private Document document; + public Html(List strings) { super(strings); } public Html(String text) { super(text); + this.document = Jsoup.parse(text); + } + + public Html(Document document) { + super(document.html()); + this.document = document; } public static Html create(String text) { @@ -53,38 +67,34 @@ public class Html extends PlainText { @Override public Selectable links() { - XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); - return selectList(xpathSelector, strings); + return xpath("//a/@href"); } @Override public Selectable xpath(String xpath) { - XsoupSelector xpathSelector = new XsoupSelector(xpath); - return selectList(xpathSelector, strings); + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document!=null){ + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } - @Override - public Selectable text() { - TextContentSelector selector = Selectors.text(); - return select(selector, strings); - } - - @Override - public Selectable text(String newlineSeparator) { - TextContentSelector selector = Selectors.text(newlineSeparator); - return select(selector, strings); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index df6926d..9406f3a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -89,7 +89,7 @@ public class PlainText implements Selectable { @Override public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); return select(replaceSelector, strings); } @@ -107,18 +107,6 @@ public class PlainText implements Selectable { } } - @Override - public Selectable text() { - //do nothing - return this; - } - - @Override - public Selectable text(String newlineSeparator) { - //do nothing - return this; - } - @Override public boolean match() { return strings != null && strings.size() > 0; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 398906f..66df5d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,20 +82,6 @@ public interface Selectable { */ public String toString(); - /** - * select text content of html - * - * @return text - */ - public Selectable text(); - - /** - * select text content of html - * - * @return text - */ - public Selectable text(String newlineSeparator); - /** * if result exist for select * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java deleted file mode 100644 index 8a0c76c..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.apache.commons.lang3.StringUtils; - -import java.lang.reflect.Constructor; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Selector factory with some inner cache.
- * - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class SelectorFactory { - - private Map innerCache = new ConcurrentHashMap(); - - private static final SelectorFactory INSTATNCE = new SelectorFactory(); - - public static SelectorFactory getInstatnce() { - return INSTATNCE; - } - - public RegexSelector newRegexSelector(String regex) { - return newSelector(RegexSelector.class, regex); - } - - public RegexSelector newRegexSelector(String regex, int group) { - String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group)); - if (innerCache.get(cacheKey) != null) { - return (RegexSelector) innerCache.get(cacheKey); - } - return new RegexSelector(regex, group); - } - - public ReplaceSelector newReplaceSelector(String regex, String replacement) { - return newSelector(ReplaceSelector.class, regex, replacement); - } - - public XpathSelector newXpathSelector(String xpath) { - return newSelector(XpathSelector.class, xpath); - } - - public SmartContentSelector newSmartContentSelector() { - return newSelector(SmartContentSelector.class); - } - - public T newAndCacheSelector(Class clazz, String... param) { - String cacheKey = getCacheKey(RegexSelector.class, param); - if (innerCache.get(cacheKey) != null) { - return (T) innerCache.get(cacheKey); - } - T selector = newSelector(clazz, param); - if (selector != null) { - innerCache.put(cacheKey, selector); - } - return selector; - - } - - public T newSelector(Class clazz, String... param) { - try { - if (param.length == 0) { - Constructor constructor - = clazz.getConstructor(); - T selector = constructor.newInstance(); - return selector; - } else if (param.length == 1) { - Constructor constructor - = clazz.getConstructor(String.class); - T selector = constructor.newInstance(param[0]); - return selector; - } else if (param.length == 2) { - Constructor constructor - = clazz.getConstructor(String.class, String.class); - T selector = constructor.newInstance(param[0], param[1]); - return selector; - } else { - throw new UnsupportedOperationException(); - } - } catch (Exception e) { - throw new IllegalArgumentException("init object error", e); - } - } - - private String getCacheKey(Class clazz, String... param) { - return clazz.toString() + "_" + StringUtils.join(param, "_"); - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 051d6a4..9764641 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector; public abstract class Selectors { public static RegexSelector regex(String expr) { - return SelectorFactory.getInstatnce().newRegexSelector(expr); + return new RegexSelector(expr); } public static RegexSelector regex(String expr, int group) { - return SelectorFactory.getInstatnce().newRegexSelector(expr, group); + return new RegexSelector(expr,group); } public static SmartContentSelector smartContent() { - return SelectorFactory.getInstatnce().newSmartContentSelector(); + return new SmartContentSelector(); } public static CssSelector $(String expr) { @@ -29,7 +29,11 @@ public abstract class Selectors { } public static XpathSelector xpath(String expr) { - return SelectorFactory.getInstatnce().newXpathSelector(expr); + return new XpathSelector(expr); + } + + public static XsoupSelector xsoup(String expr) { + return new XsoupSelector(expr); } public static AndSelector and(Selector... selectors) { @@ -40,14 +44,6 @@ public abstract class Selectors { return new OrSelector(selectors); } - public static TextContentSelector text() { - return new TextContentSelector(); - } - - public static TextContentSelector text(String newlineSeperator) { - return new TextContentSelector(newlineSeperator); - } - public static void main(String[] args) { String s = "a"; or(regex("(.*)"), xpath("//title"), $("title")).select(s); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java deleted file mode 100644 index 54e8204..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java +++ /dev/null @@ -1,68 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Extract text content in html.
- * Algorithm from http://www.elias.cn/En/ExtMainText.
- * - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelector implements Selector { - - private String newLineSeperator = "\n"; - - public TextContentSelector() { - } - - public TextContentSelector(String newLineSeperator) { - this.newLineSeperator = newLineSeperator; - } - - private final static Set TAGS_IN_NEWLINE = new HashSet(); - - private final static Set TAGS_TO_IGNORE = new HashSet(); - - static { - TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"})); - TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"})); - } - - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - return select0(doc); - } - - protected String select0(Element element) { - String tagName = element.tagName().toLowerCase(); - if (TAGS_TO_IGNORE.contains(tagName)) { - return ""; - } - StringBuilder textBuilder = new StringBuilder(); - textBuilder.append(element.text()); - if (element.children() != null) { - for (Element child : element.children()) { - textBuilder.append(select0(child)); - } - } - if (TAGS_IN_NEWLINE.contains(tagName)) { - textBuilder.append(newLineSeperator); - } - return textBuilder.toString(); - } - - @Override - public List selectList(String text) { - throw new UnsupportedOperationException(); - } - -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java deleted file mode 100644 index f501824..0000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.selector; - -import junit.framework.Assert; -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.downloader.HttpClientDownloader; - -/** - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelectorTest { - - @Test - public void test() { - String html = "
\n" + - "
\n" + - "

Add more powerful selector for content text extract refered to http://www.elias.cn/En/ExtMainText

\n" + - "
\n" + - "
"; - TextContentSelector textContentSelector = new TextContentSelector("
"); - String text = textContentSelector.select(html); - Assert.assertNotNull(text); - } - - @Ignore("takes long time") - @Test - public void testDownload() { - String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8") - .smartContent().text().toString(); - Assert.assertNotNull(s); - } - -} From 326b97c65a3e9516d06ef7e46da53757ac04f175 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 00:15:54 +0800 Subject: [PATCH 6/8] update --- .../main/java/us/codecraft/webmagic/Page.java | 7 ++-- .../webmagic/selector/CacheElement.java | 36 +++++++++++++++++++ .../us/codecraft/webmagic/selector/Html.java | 30 ++++++++++++++++ .../webmagic/model/PageModelExtractor.java | 19 ++++++---- 4 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 93c184d..0821e6d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; @@ -28,7 +29,7 @@ public class Page { private ResultItems resultItems = new ResultItems(); - private Selectable html; + private Html html; private Selectable url; @@ -58,11 +59,11 @@ public class Page { * * @return html */ - public Selectable getHtml() { + public Html getHtml() { return html; } - public void setHtml(Selectable html) { + public void setHtml(Html html) { this.html = html; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java new file mode 100644 index 0000000..a58eba2 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.List; + +/** + * Cache parsed element for extract. + * + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public class CacheElement { + + public String text; + + public Element element; + + public String select(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.select(getElement()); + } else { + return selector.select(getText()); + } + } + + public List selectList(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.selectList(getElement()); + } else { + return selector.selectList(getText()); + } + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 06987d8..74aa976 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -97,4 +97,34 @@ public class Html extends PlainText { return selectList(cssSelector, strings); } + public Document getDocument() { + return document; + } + + public String getText() { + return document.html(); + } + + /** + * + * @param selector + * @return + */ + public String select(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.select(getDocument()); + } else { + return selector.select(getText()); + } + } + + public List selectList(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.selectList(getDocument()); + } else { + return selector.selectList(getText()); + } + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index a16c7a1..8849052 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; @@ -34,7 +35,7 @@ class PageModelExtractor { private List fieldExtractors; - private Extractor extractor; + private Extractor objectExtractor; public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); @@ -169,7 +170,7 @@ class PageModelExtractor { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } @@ -183,12 +184,12 @@ class PageModelExtractor { if (!matched) { return null; } - if (extractor == null) { + if (objectExtractor == null) { return processSingle(page, page.getHtml().toString()); } else { - if (extractor.multi) { + if (objectExtractor.multi) { List os = new ArrayList(); - List list = extractor.getSelector().selectList(page.getHtml().toString()); + List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { Object o = processSingle(page, s); if (o != null) { @@ -197,13 +198,19 @@ class PageModelExtractor { } return os; } else { - String select = extractor.getSelector().select(page.getHtml().toString()); + String select = objectExtractor.getSelector().select(page.getHtml().toString()); Object o = processSingle(page, select); return o; } } } + private List select(Selector selector,Element element,String html){ + if (selector instanceof ElementSelector){ + + } + } + private Object processSingle(Page page, String html) { Object o = null; try { From 194518fd82f31e1a08f8966f26324c2e9381ddc3 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 08:21:34 +0800 Subject: [PATCH 7/8] add switch --- .../java/us/codecraft/webmagic/Spider.java | 9 + .../webmagic/selector/CacheElement.java | 36 - .../us/codecraft/webmagic/selector/Html.java | 26 +- .../webmagic/utils/EnvironmentUtil.java | 28 + .../webmagic/utils/EnvironmentUtilTest.java | 18 + .../webmagic/model/PageModelExtractor.java | 31 +- .../webmagic/utils/ExtractorUtils.java | 17 +- .../codecraft/model/ProcessorBenchmark.java | 890 ++++++++++++++++++ 8 files changed, 992 insertions(+), 63 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java create mode 100644 webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c5c239f..723e805 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; import java.io.Closeable; @@ -368,6 +369,14 @@ public class Spider implements Runnable, Task { return this; } + /** + * switch off xsoup + * @return + */ + public static void xsoupOff(){ + EnvironmentUtil.setUseXsoup(false); + } + @Override public String getUUID() { if (uuid != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java deleted file mode 100644 index a58eba2..0000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CacheElement.java +++ /dev/null @@ -1,36 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Element; - -import java.util.List; - -/** - * Cache parsed element for extract. - * - * @author code4crafter@gmail.com - * @since 0.2.2 - */ -public class CacheElement { - - public String text; - - public Element element; - - public String select(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.select(getElement()); - } else { - return selector.select(getText()); - } - } - - public List selectList(Selector selector) { - if (selector instanceof ElementSelector) { - ElementSelector elementSelector = (ElementSelector) selector; - return elementSelector.selectList(getElement()); - } else { - return selector.selectList(getText()); - } - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 74aa976..1798824 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import us.codecraft.webmagic.utils.EnvironmentUtil; import java.util.ArrayList; import java.util.List; @@ -72,17 +73,22 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XsoupSelector xsoupSelector = new XsoupSelector(xpath); - if (document!=null){ - return new Html(xsoupSelector.selectList(document)); + if (EnvironmentUtil.useXsoup()) { + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document != null) { + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); + } else { + XpathSelector xpathSelector = new XpathSelector(xpath); + return selectList(xpathSelector, strings); } - return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -91,7 +97,7 @@ public class Html extends PlainText { @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); - if (document!=null){ + if (document != null) { return new Html(cssSelector.selectList(document)); } return selectList(cssSelector, strings); @@ -102,15 +108,17 @@ public class Html extends PlainText { } public String getText() { + if (strings!=null&&strings.size()>0){ + return strings.get(0); + } return document.html(); } /** - * * @param selector * @return */ - public String select(Selector selector) { + public String selectDocument(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); @@ -119,7 +127,7 @@ public class Html extends PlainText { } } - public List selectList(Selector selector) { + public List selectDocumentForList(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java new file mode 100644 index 0000000..1d63aec --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.commons.lang3.BooleanUtils; + +import java.util.Properties; + +/** + * @author code4crafter@gmail.com + * @since 0.2.2 + */ +public abstract class EnvironmentUtil { + + private static final String USE_XSOUP = "xsoup"; + + public static boolean useXsoup() { + Properties properties = System.getProperties(); + Object o = properties.get(USE_XSOUP); + if (o == null) { + return true; + } + return BooleanUtils.toBoolean(((String) o).toLowerCase()); + } + + public static void setUseXsoup(boolean useXsoup) { + Properties properties = System.getProperties(); + properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false")); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java new file mode 100644 index 0000000..cb620e7 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; + +import static junit.framework.Assert.*; + +/** + * @author code4crafter@gmail.com + */ +public class EnvironmentUtilTest { + + @Test + public void test() { + assertTrue(EnvironmentUtil.useXsoup()); + EnvironmentUtil.setUseXsoup(false); + assertFalse(EnvironmentUtil.useXsoup()); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 8849052..03cd3a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; -import org.jsoup.nodes.Element; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; @@ -185,13 +184,13 @@ class PageModelExtractor { return null; } if (objectExtractor == null) { - return processSingle(page, page.getHtml().toString()); + return processSingle(page, null, false); } else { if (objectExtractor.multi) { List os = new ArrayList(); List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { - Object o = processSingle(page, s); + Object o = processSingle(page, s, false); if (o != null) { os.add(o); } @@ -199,19 +198,13 @@ class PageModelExtractor { return os; } else { String select = objectExtractor.getSelector().select(page.getHtml().toString()); - Object o = processSingle(page, select); + Object o = processSingle(page, select, false); return o; } } } - private List select(Selector selector,Element element,String html){ - if (selector instanceof ElementSelector){ - - } - } - - private Object processSingle(Page page, String html) { + private Object processSingle(Page page, String html, boolean isRaw) { Object o = null; try { o = clazz.newInstance(); @@ -220,10 +213,14 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().selectList(html); + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); @@ -239,10 +236,14 @@ class PageModelExtractor { String value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().select(html); + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 1099636..2d9fd51 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -8,6 +8,7 @@ import java.util.List; /** * Tools for annotation converting.
+ * * @author code4crafter@gmail.com
* @since 0.2.1 */ @@ -24,17 +25,27 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); break; default: - selector = new XsoupSelector(value); + selector = getXpathSelector(value); + } + return selector; + } + + private static Selector getXpathSelector(String value) { + Selector selector; + if (EnvironmentUtil.useXsoup()) { + selector = new XsoupSelector(value); + } else { + selector = new XpathSelector(value); } return selector; } public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); - if (extractBies==null){ + if (extractBies == null) { return selectors; } for (ExtractBy extractBy : extractBies) { diff --git a/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java new file mode 100644 index 0000000..c3f2829 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/model/ProcessorBenchmark.java @@ -0,0 +1,890 @@ +package us.codecraft.webmagic.model; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + @Test + public void test() { + ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +} From aefd0569a5bfb2f8a99de948ccac38302af19500 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 09:36:56 +0800 Subject: [PATCH 8/8] update version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- .../us/codecraft/webmagic/selector/BaseElementSelector.java | 2 +- .../java/us/codecraft/webmagic/selector/ElementSelector.java | 2 +- .../main/java/us/codecraft/webmagic/selector/XsoupSelector.java | 2 +- .../main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java | 2 +- webmagic-extension/pom.xml | 2 +- .../us/codecraft/webmagic/pipeline/FilePageModelPipeline.java | 2 +- webmagic-samples/pom.xml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index e3bd30e..5b47984 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ef9f84a..9e3d4a2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index d14a708..e313f24 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -6,7 +6,7 @@ import java.util.List; /** * @author code4crafter@gmail.com - * @since 0.2.2 + * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector,ElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java index 793b825..e422ac8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java @@ -8,7 +8,7 @@ import java.util.List; * Selector(extractor) for html elements.
* * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public interface ElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java index 698b29b..ea46290 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java @@ -10,7 +10,7 @@ import java.util.List; * XPath selector based on Xsoup.
* * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public class XsoupSelector extends BaseElementSelector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java index 1d63aec..7aa5c13 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -6,7 +6,7 @@ import java.util.Properties; /** * @author code4crafter@gmail.com - * @since 0.2.2 + * @since 0.3.0 */ public abstract class EnvironmentUtil { diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1914b71..4cad2b0 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index d3ed1f0..5586863 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -18,7 +18,7 @@ import java.io.PrintWriter; * Otherwise use SHA1 as file name. * * @author code4crafter@gmail.com
- * @since 0.2.2 + * @since 0.3.0 */ public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a349a68..a620ae5 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.2-SNAPSHOT + 0.3.0-SNAPSHOT 4.0.0