From 81e7f7982e7d40fd6c2701b542893c745be4849f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 20 Jul 2013 08:34:18 +0800 Subject: [PATCH] invite jsoup and cssselector --- webmagic-core/pom.xml | 6 +++ .../java/us/codecraft/webmagic/Spider.java | 2 + .../webmagic/selector/CssSelector.java | 47 +++++++++++++++++++ .../us/codecraft/webmagic/selector/Html.java | 6 +++ .../webmagic/selector/PlainText.java | 5 ++ .../webmagic/selector/Selectable.java | 8 ++++ .../java/us/codecraft/webmagic/HtmlTest.java | 1 + .../webmagic/selector/XpathSelectorTest.java | 2 +- 8 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index df482f7..7d787aa 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -52,6 +52,12 @@ 2.4 + + org.jsoup + jsoup + 1.7.2 + + org.apache.commons commons-io diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index b2a2fa6..1288ff8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -154,9 +154,11 @@ public class Spider implements Runnable, Task { request = scheduler.poll(this); } } else { + //multi thread final AtomicInteger threadAlive = new AtomicInteger(0); while (true) { if (request == null) { + //when no request found but some thread is alive, sleep a while. 
try { Thread.sleep(100); } catch (InterruptedException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java new file mode 100644 index 0000000..c2d654a --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.selector; + +import org.apache.commons.collections.CollectionUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-4-21
+ * Time: 9:39 AM
+ */
+public class CssSelector implements Selector {
+    /** CSS (jquery-style) query passed to jsoup's {@code Document.select}. */
+    private final String selectorText;
+    /** Creates a selector that evaluates {@code selectorText} against parsed HTML text. */
+    public CssSelector(String selectorText) {
+        this.selectorText = selectorText;
+    }
+    /** Returns the outer HTML of the first matching element, or {@code null} when nothing matches. */
+    @Override
+    public String select(String text) {
+        Document doc = Jsoup.parse(text);
+        Elements elements = doc.select(selectorText);
+        if (CollectionUtils.isEmpty(elements)) { // no match: report null instead of indexing an empty result
+            return null;
+        }
+        return elements.get(0).outerHtml();
+    }
+    /** Returns the outer HTML of every matching element; the list is empty when nothing matches. */
+    @Override
+    public List<String> selectList(String text) {
+        List<String> strings = new ArrayList<String>();
+        Document doc = Jsoup.parse(text);
+        Elements elements = doc.select(selectorText);
+        if (CollectionUtils.isNotEmpty(elements)) {
+            for (Element element : elements) {
+                strings.add(element.outerHtml());
+            }
+        }
+        return strings;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 0b36372..099f507 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -62,4 +62,10 @@ public class Html extends PlainText {
         return selectList(xpathSelector, strings);
     }
 
+    @Override
+    public Selectable $(String selector) {
+        CssSelector cssSelector = new CssSelector(selector);
+        return selectList(cssSelector,strings);
+    }
+
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
index cedee63..0137de8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
@@ -33,6 +33,11 @@ public class PlainText implements Selectable {
         throw new UnsupportedOperationException();
     }
 
+    @Override
+    public Selectable $(String selector) {
+        throw new UnsupportedOperationException();
+    }
+
     @Override
     public Selectable smartContent() {
         throw new 
UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 630808d..f4aa9a5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -17,6 +17,14 @@ public interface Selectable { */ public Selectable xpath(String xpath); + /** + * Select elements with a jquery-style selector. + * + * @param selector the jquery-style selector expression to evaluate + * @return the Selectable produced by the selection + */ + public Selectable $(String selector); + /** * select smart content with ReadAbility algorithm * diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index fcdbfef..c900014 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -17,4 +17,5 @@ public class HtmlTest { Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index e13b809..3ef0a92 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1351,7 +1351,7 @@ public class XpathSelectorTest { public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); - System.out.println(html1.regex("(.*?)").links().toStrings()); + Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings()); } }