diff --git a/README.md b/README.md index 47c93f6..5624019 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ webmagic [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) ->A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. +>A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler. ## Features: @@ -16,24 +16,18 @@ webmagic ## Install: - -Clone the repo and build: - - git clone https://github.com/code4craft/webmagic.git - cd webmagic - mvn clean install - -Add dependencies to your project: + +Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.2.0 + 0.3.0 us.codecraft webmagic-extension - 0.2.0 + 0.3.0 ## Get Started: @@ -42,6 +36,7 @@ Add dependencies to your project: Write a class implements PageProcessor: +```java public class OschinaBlogPageProcesser implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net") @@ -67,6 +62,7 @@ Write a class implements PageProcessor: .pipeline(new ConsolePipeline()).run(); } } +``` * `page.addTargetRequests(links)` @@ -74,6 +70,7 @@ Write a class implements PageProcessor: You can also use annotation way: +```java @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @@ -92,6 +89,7 @@ You can also use annotation way: new ConsolePageModelPipeline(), OschinaBlog.class).run(); } } +``` ### Docs and samples: diff --git a/pom.xml b/pom.xml index cd78657..8b3c987 100644 --- a/pom.xml +++ b/pom.xml @@ -6,9 +6,13 @@ 7 us.codecraft - 0.2.1 + 0.3.1-SNAPSHOT 4.0.0 pom + + UTF-8 + UTF-8 + webmagic-parent webmagic-parent @@ -32,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git 
git@github.com:code4craft/webmagic.git - webmagic-parent-0.2.1 + HEAD @@ -44,7 +48,6 @@ webmagic-core webmagic-extension/ - webmagic-samples/ @@ -60,6 +63,11 @@ httpclient 4.2.4 + + us.codecraft + xsoup + 0.1.0 + net.sf.saxon Saxon-HE diff --git a/release-note.md b/release-note.md index ee3a962..001568b 100755 --- a/release-note.md +++ b/release-note.md @@ -1,5 +1,19 @@ Release Notes ---- +*2012-9-4* `version:0.3.0` + +* Change default XPath selector from HtmlCleaner to [Xsoup](https://github.com/code4craft/xsoup). + + [Xsoup](https://github.com/code4craft/xsoup) is an XPath selector based on Jsoup written by me. It has much better performance than HtmlCleaner. + + Time of processing a page is reduced from 7~9ms to 0.4ms. + + If Xsoup is not stable for your usage, just use `Spider.xsoupOff()` to turn off it and report an issue to me! + +* Add cycle retry times for Site. + + When cycle retry times is set, Spider will put the url which downloading failed back to scheduler, and retry after a cycle of queue. + *2012-8-20* `version:0.2.1` ComboExtractor support for annotation. 
diff --git a/webmagic manual.md b/webmagic manual.md index 0f4d4e4..dc09b90 100644 --- a/webmagic manual.md +++ b/webmagic manual.md @@ -21,22 +21,17 @@ webmagic使用手册 ### 使用maven -webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: - - git clone https://github.com/code4craft/webmagic.git - mvn clean install - -安装后,在项目中添加对应的依赖即可使用webmagic: +webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: us.codecraft webmagic-core - 0.2.0 + 0.2.1 us.codecraft webmagic-extension - 0.2.0 + 0.2.1 #### 项目结构 @@ -51,7 +46,7 @@ webmagic主要包括两个包: webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 -webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译: * **webmagic-saxon** diff --git a/webmagic-core/module_webmagic-core.xml b/webmagic-core/module_webmagic-core.xml new file mode 100644 index 0000000..88c4cfa --- /dev/null +++ b/webmagic-core/module_webmagic-core.xml @@ -0,0 +1,156 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 0679106..eb4a751 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.1 + 0.3.1-SNAPSHOT 4.0.0 @@ -25,6 +25,11 @@ commons-lang3 + + us.codecraft + xsoup + + log4j log4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index afdf232..0821e6d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.selector.Html; import 
us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; @@ -28,7 +29,7 @@ public class Page { private ResultItems resultItems = new ResultItems(); - private Selectable html; + private Html html; private Selectable url; @@ -58,11 +59,11 @@ public class Page { * * @return html */ - public Selectable getHtml() { + public Html getHtml() { return html; } - public void setHtml(Selectable html) { + public void setHtml(Html html) { this.html = html; } @@ -87,6 +88,23 @@ public class Page { } } + /** + * add urls to fetch + * + * @param requests + */ + public void addTargetRequests(List requests,long priority) { + synchronized (targetRequests) { + for (String s : requests) { + if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { + break; + } + s = UrlUtils.canonicalizeUrl(s, url.toString()); + targetRequests.add(new Request(s).setPriority(priority)); + } + } + } + /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 694d32b..142a20c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -17,6 +17,8 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; + public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; + private String url; /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 443f2bb..6a35178 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -30,6 +30,8 @@ public class Site { private int retryTimes = 0; + private int cycleRetryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = 
DEFAULT_STATUS_CODE_SET; @@ -200,7 +202,7 @@ public class Site { } /** - * Get retry times when download fail, 0 by default.
+ * Get the number of times a failed download is retried immediately, 0 by default.
* * @return retry times when download fail */ @@ -218,6 +220,25 @@ public class Site { return this; } + /** + * When cycleRetryTimes is more than 0, a request whose download failed is added back to the scheduler and downloaded again.
+ * + * @return cycle retry times when a download fails + */ + public int getCycleRetryTimes() { + return cycleRetryTimes; + } + + /** + * Set the cycle retry times used when a download fails, 0 by default. Only works with RedisScheduler.
+ * + * @return this + */ + public Site setCycleRetryTimes(int cycleRetryTimes) { + this.cycleRetryTimes = cycleRetryTimes; + return this; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index c5c239f..47cefd0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -9,6 +9,7 @@ import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.utils.EnvironmentUtil; import us.codecraft.webmagic.utils.ThreadUtils; import java.io.Closeable; @@ -309,6 +310,12 @@ public class Spider implements Runnable, Task { sleep(site.getSleepTime()); return; } + //for cycle retry + if (page.getHtml()==null){ + addRequest(page); + sleep(site.getSleepTime()); + return; + } pageProcessor.process(page); addRequest(page); if (!page.getResultItems().isSkip()) { @@ -368,6 +375,14 @@ public class Spider implements Runnable, Task { return this; } + /** + * switch off xsoup + * @return + */ + public static void xsoupOff(){ + EnvironmentUtil.setUseXsoup(false); + } + @Override public String getUUID() { if (uuid != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7563410..82a4a9a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -46,6 +46,17 @@ public class HttpClientDownloader implements Downloader { return (Html) page.getHtml(); } + /** + * A simple method to download a url. 
+ * + * @param url + * @return html + */ + public Html download(String url, String charset) { + Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); + return (Html) page.getHtml(); + } + @Override public Page download(Request request, Task task) { Site site = null; @@ -79,6 +90,21 @@ public class HttpClientDownloader implements Downloader { if (tried > retryTimes) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } + return page; + } return null; } logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); @@ -87,13 +113,12 @@ public class HttpClientDownloader implements Downloader { } while (retry); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (acceptStatCode.contains(statusCode)) { + handleGzip(httpResponse); //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); charset = UrlUtils.getCharset(value); } - // - handleGzip(httpResponse); return handleResponse(request, charset, httpResponse, task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java new file mode 100644 index 0000000..e313f24 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -0,0 +1,23 @@ 
+package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; + +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.3.0 + */ +public abstract class BaseElementSelector implements Selector,ElementSelector { + + @Override + public String select(String text) { + return select(Jsoup.parse(text)); + } + + @Override + public List selectList(String text) { + return selectList(Jsoup.parse(text)); + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 5031077..9c7032c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,8 +1,6 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -15,7 +13,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class CssSelector implements Selector { +public class CssSelector extends BaseElementSelector { private String selectorText; @@ -30,16 +28,6 @@ public class CssSelector implements Selector { this.attrName = attrName; } - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - Elements elements = doc.select(selectorText); - if (CollectionUtils.isEmpty(elements)) { - return null; - } - return getValue(elements.get(0)); - } - private String getValue(Element element) { if (attrName == null) { return element.outerHtml(); @@ -51,9 +39,17 @@ public class CssSelector implements Selector { } @Override - public List selectList(String text) { + public String select(Element element) { + Elements elements = element.select(selectorText); + if (CollectionUtils.isEmpty(elements)) { + return null; + } + return getValue(elements.get(0)); + } + + @Override + public List selectList(Element doc) { List strings = new ArrayList(); - Document doc = Jsoup.parse(text); Elements elements = doc.select(selectorText); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java new file mode 100644 index 0000000..e422ac8 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.List; + +/** + * Selector(extractor) for html elements.
+ * + * @author code4crafter@gmail.com
+ * @since 0.3.0 + */ +public interface ElementSelector { + + /** + * Extract a single result from the element.
+ * If there is more than one result, only the first will be chosen. + * + * @param element + * @return result + */ + public String select(Element element); + + /** + * Extract all results from the element.
+ * + * @param element + * @return results + */ + public List selectList(Element element); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 1d5e8c5..b9b7f02 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,10 @@ package us.codecraft.webmagic.selector; +import org.apache.log4j.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import us.codecraft.webmagic.utils.EnvironmentUtil; + import java.util.ArrayList; import java.util.List; @@ -11,12 +16,29 @@ import java.util.List; */ public class Html extends PlainText { + private Logger logger = Logger.getLogger(getClass()); + + /** + * Store parsed document for better performance when only one text exist. + */ + private Document document; + public Html(List strings) { super(strings); } public Html(String text) { super(text); + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + logger.warn("parse document error ", e); + } + } + + public Html(Document document) { + super(document.html()); + this.document = document; } public static Html create(String text) { @@ -47,32 +69,77 @@ public class Html extends PlainText { @Override public Selectable smartContent() { - SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); + SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, strings); } @Override public Selectable links() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return selectList(xpathSelector, strings); + return xpath("//a/@href"); } @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); - return selectList(xpathSelector, 
strings); + if (EnvironmentUtil.useXsoup()) { + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document != null) { + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); + } else { + XpathSelector xpathSelector = new XpathSelector(xpath); + return selectList(xpathSelector, strings); + } } @Override public Selectable $(String selector) { - CssSelector cssSelector = new CssSelector(selector); + CssSelector cssSelector = Selectors.$(selector); + if (document != null) { + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { - CssSelector cssSelector = new CssSelector(selector, attrName); + CssSelector cssSelector = Selectors.$(selector, attrName); + if (document != null) { + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } + public Document getDocument() { + return document; + } + + public String getText() { + if (strings != null && strings.size() > 0) { + return strings.get(0); + } + return document.html(); + } + + /** + * @param selector + * @return + */ + public String selectDocument(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.select(getDocument()); + } else { + return selector.select(getText()); + } + } + + public List selectDocumentForList(Selector selector) { + if (selector instanceof ElementSelector) { + ElementSelector elementSelector = (ElementSelector) selector; + return elementSelector.selectList(getDocument()); + } else { + return selector.selectList(getText()); + } + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index e0501eb..9406f3a 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -57,13 +57,13 @@ public class PlainText implements Selectable { @Override public Selectable regex(String regex) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); + RegexSelector regexSelector = Selectors.regex(regex); return selectList(regexSelector, strings); } @Override public Selectable regex(String regex, int group) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group); + RegexSelector regexSelector = Selectors.regex(regex, group); return selectList(regexSelector, strings); } @@ -89,7 +89,7 @@ public class PlainText implements Selectable { @Override public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); return select(replaceSelector, strings); } @@ -106,4 +106,9 @@ public class PlainText implements Selectable { return null; } } + + @Override + public boolean match() { + return strings != null && strings.size() > 0; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 21c9381..66df5d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,6 +82,13 @@ public interface Selectable { */ public String toString(); + /** + * if result exist for select + * + * @return true if result exist + */ + public boolean match(); + /** * multi string result * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index b52d128..9764641 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -9,11 +9,15 @@ package us.codecraft.webmagic.selector; public abstract class Selectors { public static RegexSelector regex(String expr) { - return SelectorFactory.getInstatnce().newRegexSelector(expr); + return new RegexSelector(expr); } public static RegexSelector regex(String expr, int group) { - return SelectorFactory.getInstatnce().newRegexSelector(expr, group); + return new RegexSelector(expr,group); + } + + public static SmartContentSelector smartContent() { + return new SmartContentSelector(); } public static CssSelector $(String expr) { @@ -25,7 +29,11 @@ public abstract class Selectors { } public static XpathSelector xpath(String expr) { - return SelectorFactory.getInstatnce().newXpathSelector(expr); + return new XpathSelector(expr); + } + + public static XsoupSelector xsoup(String expr) { + return new XsoupSelector(expr); } public static AndSelector and(Selector... selectors) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java new file mode 100644 index 0000000..ea46290 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; + +import java.util.List; + +/** + * XPath selector based on Xsoup.
+ * + * @author code4crafter@gmail.com
+ * @since 0.3.0 + */ +public class XsoupSelector extends BaseElementSelector { + + private XPathEvaluator xPathEvaluator; + + public XsoupSelector(String xpathStr) { + this.xPathEvaluator = Xsoup.compile(xpathStr); + } + + @Override + public String select(Element element) { + return xPathEvaluator.evaluate(element).get(); + } + + @Override + public List selectList(Element element) { + return xPathEvaluator.evaluate(element).list(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java new file mode 100644 index 0000000..7aa5c13 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/EnvironmentUtil.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.commons.lang3.BooleanUtils; + +import java.util.Properties; + +/** + * @author code4crafter@gmail.com + * @since 0.3.0 + */ +public abstract class EnvironmentUtil { + + private static final String USE_XSOUP = "xsoup"; + + public static boolean useXsoup() { + Properties properties = System.getProperties(); + Object o = properties.get(USE_XSOUP); + if (o == null) { + return true; + } + return BooleanUtils.toBoolean(((String) o).toLowerCase()); + } + + public static void setUseXsoup(boolean useXsoup) { + Properties properties = System.getProperties(); + properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false")); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 7dae1f2..4e1140b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import java.nio.charset.Charset; import java.util.regex.Matcher; import 
java.util.regex.Pattern; @@ -98,15 +99,17 @@ public class UrlUtils { return stringBuilder.toString(); } - private static final Pattern patternForCharset = Pattern.compile("charset=([^\\s;]*)"); + private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { - return matcher.group(1); - } else { - return null; + String charset = matcher.group(1); + if (Charset.isSupported(charset)) { + return charset; + } } + return null; } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java new file mode 100644 index 0000000..cb620e7 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/EnvironmentUtilTest.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Test; + +import static junit.framework.Assert.*; + +/** + * @author code4crafter@gmail.com + */ +public class EnvironmentUtilTest { + + @Test + public void test() { + assertTrue(EnvironmentUtil.useXsoup()); + EnvironmentUtil.setUseXsoup(false); + assertFalse(EnvironmentUtil.useXsoup()); + } +} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 29ad49d..4cdf001 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.2.1 + 0.3.1-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index a16c7a1..03cd3a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -34,7 +34,7 @@ class PageModelExtractor { private List 
fieldExtractors; - private Extractor extractor; + private Extractor objectExtractor; public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); @@ -169,7 +169,7 @@ class PageModelExtractor { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } @@ -183,28 +183,28 @@ class PageModelExtractor { if (!matched) { return null; } - if (extractor == null) { - return processSingle(page, page.getHtml().toString()); + if (objectExtractor == null) { + return processSingle(page, null, false); } else { - if (extractor.multi) { + if (objectExtractor.multi) { List os = new ArrayList(); - List list = extractor.getSelector().selectList(page.getHtml().toString()); + List list = objectExtractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { - Object o = processSingle(page, s); + Object o = processSingle(page, s, false); if (o != null) { os.add(o); } } return os; } else { - String select = extractor.getSelector().select(page.getHtml().toString()); - Object o = processSingle(page, select); + String select = objectExtractor.getSelector().select(page.getHtml().toString()); + Object o = processSingle(page, select, false); return o; } } } - private Object processSingle(Page page, String html) { + private Object processSingle(Page page, String html, boolean isRaw) { Object o = null; try { o = clazz.newInstance(); @@ -213,10 +213,14 @@ class PageModelExtractor { List value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + value = 
page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().selectList(html); + if (isRaw) { + value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().selectList(html); + } break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); @@ -232,10 +236,14 @@ class PageModelExtractor { String value; switch (fieldExtractor.getSource()) { case RawHtml: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); break; case Html: - value = fieldExtractor.getSelector().select(html); + if (isRaw) { + value = page.getHtml().selectDocument(fieldExtractor.getSelector()); + } else { + value = fieldExtractor.getSelector().select(html); + } break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java new file mode 100644 index 0000000..5586863 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -0,0 +1,55 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HasKey; +import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.utils.FilePersistentBase; + +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * Store results objects (page models) to files in plain format.
+ * Use model.key() as file name if the model implements HasKey.
+ * Otherwise use the MD5 hex digest of the object's reflection-based string form as file name. + * + * @author code4crafter@gmail.com
+ * @since 0.3.0 + */ +public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { + + private Logger logger = Logger.getLogger(getClass()); + + /** + * new JsonFilePageModelPipeline with default path "/data/webmagic/" + */ + public FilePageModelPipeline() { + setPath("/data/webmagic/"); + } + + public FilePageModelPipeline(String path) { + setPath(path); + } + + @Override + public void process(Object o, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + try { + String filename; + if (o instanceof HasKey) { + filename = path + ((HasKey) o).key() + ".html"; + } else { + filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".html"; + } + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename))); + printWriter.write(ToStringBuilder.reflectionToString(o)); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e191627..cd90625 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler { public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); try { - //使用Set进行url去重 - if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { - //使用List保存队列 + // if cycleRetriedTimes is set, allow duplicated. 
+ Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES); + // use set to remove duplicate url + if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { + // use list to store queue jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); if (request.getExtras() != null) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 5c6ebbf..2d9fd51 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -1,16 +1,14 @@ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.util.ArrayList; import java.util.List; /** * Tools for annotation converting.
+ * * @author code4crafter@gmail.com
* @since 0.2.1 */ @@ -27,17 +25,27 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XpathSelector(value); + selector = getXpathSelector(value); break; default: - selector = new XpathSelector(value); + selector = getXpathSelector(value); + } + return selector; + } + + private static Selector getXpathSelector(String value) { + Selector selector; + if (EnvironmentUtil.useXsoup()) { + selector = new XsoupSelector(value); + } else { + selector = new XpathSelector(value); } return selector; } public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); - if (extractBies==null){ + if (extractBies == null) { return selectors; } for (ExtractBy extractBy : extractBies) { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 35ddcaa..a42a719 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 0.3.1-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java new file mode 100644 index 0000000..7239e36 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.model.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.AfterExtractor; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.TargetUrl; + +import java.util.List; + +/** + * @author yihua.huang@dianping.com
+ * Date: 13-8-13
+ * Time: 10:13 AM
+ */ +@TargetUrl("http://*.alpha.dp/*") +public class DianpingFtlDataScanner implements AfterExtractor { + + @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true) + private List data; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class) + .thread(5).run(); + } + + @Override + public void afterProcess(Page page) { + if (data.size() > 1) { + System.err.println(page.getUrl()); + } + if (data.size() > 0 && data.get(0).length() > 100) { + System.err.println(page.getUrl()); + } + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 115f183..3ceba0a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.PlainText; @@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor { page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } @@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements 
PageProcessor { } return site; } + + public static void main(String[] args) { + Spider.create(new DiaoyuwengProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 4ffe127..3d27be8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -2,7 +2,9 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -15,14 +17,18 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); + List strings = page.getHtml().links().regex(".*/yewu/.*").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); + page.putField("body",page.getHtml().xpath("//dd")); } @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. 
+ } + + public static void main(String[] args) { + Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 89b74d6..136eeb8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -14,10 +15,9 @@ import java.util.List; public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").all(); + List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("content",page.getHtml().smartContent()); } @@ -26,4 +26,8 @@ public class HuxiuProcessor implements PageProcessor { return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new HuxiuProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index b43c3c5..38de3bc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -4,9 +4,7 @@ import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -41,8 +39,6 @@ public class InfoQMiniBookProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) - .scheduler(new RedisScheduler("localhost")) - .pipeline(new FilePipeline("/data/temp/webmagic/")) .thread(5) .run(); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java index c0b3f73..f80f895 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java @@ -3,7 +3,6 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -32,6 +31,6 @@ public class 
IteyeBlogProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run(); + Spider.create(new IteyeBlogProcessor()).thread(5).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index aff18a6..0ab6c64 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** @@ -24,4 +25,8 @@ public class KaichibaProcessor implements PageProcessor { return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new KaichibaProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index a4e6e43..bfa347d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +22,8 @@ public class MeicanProcessor implements PageProcessor { } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); - page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); + page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); } @Override @@ -30,4 +31,8 @@ public class MeicanProcessor implements PageProcessor { return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setCharset("utf-8"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new MeicanProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 8ba7063..e447003 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -1,9 +1,8 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -21,8 +20,8 @@ public class OschinaBlogPageProcesser implements PageProcessor { public void process(Page page) { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @@ -33,6 +32,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).pipeline(new ConsolePipeline()).run(); + Spider.create(new 
OschinaBlogPageProcesser()).run(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java new file mode 100644 index 0000000..a52b3d4 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java @@ -0,0 +1,82 @@ +package us.codecraft.webmagic.samples.scheduler; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.DelayQueue; +import java.util.concurrent.Delayed; +import java.util.concurrent.TimeUnit; + +/** + * @author code4crafter@gmail.com + */ +public class DelayQueueScheduler extends PriorityScheduler { + + private DelayQueue queue = new DelayQueue(); + + private Set urls = new HashSet(); + + private long time; + + private TimeUnit timeUnit; + + private class RequestWrapper implements Delayed { + + private long startTime = System.currentTimeMillis(); + + private Request request; + + private RequestWrapper(Request request) { + this.request = request; + } + + private long getStartTime() { + return startTime; + } + + private Request getRequest() { + return request; + } + + @Override + public long getDelay(TimeUnit unit) { + long convert = unit.convert(TimeUnit.MILLISECONDS.convert(time, timeUnit) - System.currentTimeMillis() + startTime, TimeUnit.MILLISECONDS); + return convert; + } + + @Override + public int compareTo(Delayed o) { + return new Long(getDelay(TimeUnit.MILLISECONDS)).compareTo(o.getDelay(TimeUnit.MILLISECONDS)); + } + } + + public DelayQueueScheduler(long time, TimeUnit timeUnit) { + this.time = time; + this.timeUnit = timeUnit; + } + + @Override + public synchronized void push(Request request, Task task) { + if (urls.add(request.getUrl())) { + queue.add(new 
RequestWrapper(request)); + } + + } + + @Override + public synchronized Request poll(Task task) { + RequestWrapper take = null; + while (take == null) { + try { + take = queue.take(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + queue.add(new RequestWrapper(take.getRequest())); + return take.getRequest(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java new file mode 100644 index 0000000..79ef209 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.samples.scheduler; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +/** + * @author code4crafter@gmail.com + */ +public class LevelLimitScheduler extends PriorityScheduler { + + private int levelLimit = 3; + + public LevelLimitScheduler(int levelLimit) { + this.levelLimit = levelLimit; + } + + @Override + public synchronized void push(Request request, Task task) { + if (((Integer) request.getExtra("_level")) <= levelLimit) { + super.push(request, task); + } + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java new file mode 100644 index 0000000..ddbaa08 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -0,0 +1,89 @@ +package us.codecraft.webmagic.samples.scheduler; + +import org.apache.commons.lang3.StringUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import 
us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.PriorityScheduler; + +import java.util.List; + +import static us.codecraft.webmagic.selector.Selectors.regex; +import static us.codecraft.webmagic.selector.Selectors.xpath; + +/** + * @author code4crafter@gmail.com + */ +public class ZipCodePageProcessor implements PageProcessor { + + private Site site = Site.me().setCharset("gb2312") + .setSleepTime(100).addStartUrl("http://www.ip138.com/post/"); + + @Override + public void process(Page page) { + if (page.getUrl().toString().equals("http://www.ip138.com/post/")) { + processCountry(page); + } else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != null) { + processProvince(page); + } else { + processDistrict(page); + } + + } + + private void processCountry(Page page) { + List provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all(); + for (String province : provinces) { + String link = xpath("//@href").select(province); + String title = xpath("/text()").select(province); + Request request = new Request(link).setPriority(0).putExtra("province", title); + page.addTargetRequest(request); + } + } + + private void processProvince(Page page) { + //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 + List districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all(); + for (String district : districts) { + String link = xpath("//@href").select(district); + String title = xpath("/text()").select(district); + Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); + page.addTargetRequest(request); + } + } + + private void processDistrict(Page page) { + String province = page.getRequest().getExtra("province").toString(); + String district = page.getRequest().getExtra("district").toString(); + List counties = 
page.getHtml().xpath("//body/table/tbody/tr").regex(".*\\d+.*").all(); + String regex = "]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)"; + for (String county : counties) { + String county0 = regex(regex, 1).select(county); + String county1 = regex(regex, 2).select(county); + String zipCode = regex(regex, 3).select(county); + page.putField("result", StringUtils.join(new String[]{province, district, + county0, county1, zipCode}, "\t")); + } + List links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all(); + for (String link : links) { + page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district)); + } + + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run(); + + PriorityScheduler scheduler = new PriorityScheduler(); + Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler); + scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider); + spider.run(); + } +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java new file mode 100644 index 0000000..5513305 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java @@ -0,0 +1,891 @@ +package us.codecraft.webmagic.model; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.samples.OschinaBlog; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * @author code4crafter@gmail.com + */ +public class ProcessorBenchmark { + + @Ignore + @Test + public void test() { 
+ ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class); + Page page = new Page(); + page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); + page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); + page.setHtml(new Html(html)); + long time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + time = System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + modelPageProcessor.process(page); + } + System.out.println(System.currentTimeMillis() - time); + } + + private String html = "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t\t
\n" + + " \t开源中国社区\n" + + "
\n" + + "
开源项目发现、使用和交流平台
\n" + + "\t\t
\n" + + " \t\n" + + "
\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\t\t
\n" + + "\t\t当前访客身份:\n" + + "\t\t\t\t黄亿华 [ 退出 ]\n" + + "\t\t\t\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\n" + + "\t\t
\n" + + "\t\t
\n" + + " \t\t
\n" + + "\t\t\t\t\n" + + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + + " \t\t\t\n" + + "\t\t\t\t\n" + + "\t\t\t\t
\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + " \t\t\t\t\n" + + "
软件
\n" + + " \n" + + "
\n" + + "\t\t\t\t\t\t\t\n" + + " \t\t
\n" + + "\t\t
\n" + + "\t\t
\n" + + "\t
\n" + + "\t
\t\n" + + "\n" + + "
\n" + + "
\n" + + "\t\t切换风格 \"黄亿华\"\n" + + " \n" + + " 黄亿华\n" + + "\t\t\n" + + "\t\t\t\n" + + " \t\t\t修改资料\n" + + "\t\t\t更换头像\n" + + " \t\t\n" + + " \n" + + "
\n" + + "
\n" + + " \t关注(43)\n" + + " \t粉丝(98)\n" + + " \t积分(173)\n" + + "
\n" + + "
\n" + + "
\n" + + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t.发表博文\n" + + "\t.空间管理\n" + + "
\n" + + " 管理» 博客分类\n" + + " \n" + + "
\n" + + "
\n" + + " 管理» 最新评论 \n" + + "
    \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@searchjack:极好的工具,\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t\t
  • \n" + + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + + "\t\t查看»\n" + + "\t
  • \n" + + "\t
\n" + + "
\n" + + "
\n" + + "访客统计\n" + + "
    \n" + + "\t
  • 6 (查看最新访客»)
  • \n" + + "
  • 284
  • \n" + + "
  • 817
  • \n" + + "
  • 1888
  • \n" + + "
  • 16453
  • \n" + + "
\n" + + "
\n" + + "\n" + + "
\n" + + "\t
\n" + + " \t\n" + + "\t
\n" + + "\t\n" + + " \t
\t\t\n" + + "
\n" + + "

Jsoup代码解读之八-防御XSS攻击

\n" + + "
\n" + + " \t\t \t\t \t\t\n" + + " \t\t\t编辑 | 删除\n" + + " \t\t\n" + + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + + " \t\t已有1628次阅读 ,共3个评论\n" + + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + + "
\n" + + "\t \t
\n" + + "

目录:[ - ]

\n" + + " \n" + + " \t
\n" + + " \n" + + "\t \t

\n" + + "\n" + + "

防御XSS攻击的一般原理

\n" + + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + + "
    \n" + + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + + "
\n" + + "\n" + + "

Cleaner与Whitelist

\n" + + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + + "
public class Whitelist {\n" +
+            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
+            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
+            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
+            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
+            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
+            "}
\n" + + "

这里定义了标签名/属性名/属性值的白名单。

\n" + + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + + "
private final class CleaningVisitor implements NodeVisitor {\n" +
+            "    private int numDiscarded = 0;\n" +
+            "    private final Element root;\n" +
+            "    private Element destination; // current element to append nodes to\n" +
+            "\n" +
+            "    private CleaningVisitor(Element root, Element destination) {\n" +
+            "        this.root = root;\n" +
+            "        this.destination = destination;\n" +
+            "    }\n" +
+            "\n" +
+            "    public void head(Node source, int depth) {\n" +
+            "        if (source instanceof Element) {\n" +
+            "            Element sourceEl = (Element) source;\n" +
+            "\n" +
+            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
+            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
+            "                Element destChild = meta.el;\n" +
+            "                destination.appendChild(destChild);\n" +
+            "\n" +
+            "                numDiscarded += meta.numAttribsDiscarded;\n" +
+            "                destination = destChild;\n" +
+            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
+            "                numDiscarded++;\n" +
+            "            }\n" +
+            "        } else if (source instanceof TextNode) {\n" +
+            "            TextNode sourceText = (TextNode) source;\n" +
+            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
+            "            destination.appendChild(destText);\n" +
+            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
+            "            numDiscarded++;\n" +
+            "        }\n" +
+            "    }\n" +
+            "\n" +
+            "    public void tail(Node source, int depth) {\n" +
+            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
+            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
+            "        }\n" +
+            "    }\n" +
+            "}
\n" + + "\n" + + "

结束语

\n" + + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + + "
    \n" + + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + + "
\n" + + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + + " \t \t \n" + + " \t\n" + + "\t
\n" + + " \t关键字:\n" + + " \t \tJsoup\n" + + " \t \tXSS\n" + + " \t \tOO\n" + + " \t \t
\n" + + "\t \t \n" + + "
\t\t\n" + + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + + "\t \t
\n" + + "\n" + + " \n" + + "\t
\n" + + "\n" + + "\t\n" + + "\t
\n" + + "\t\n" + + "\t\n" + + "\t\t分享到: \n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + " 已有 0人顶\n" + + "\t\n" + + "\t
\n" + + "\t\t\n" + + "
\n" + + "
\n" + + "
\n" + + "

共有 3 条网友评论

\n" + + "\t\t\t
    \n" + + "\t\t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"静风流云\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    貌似,OSC也是类似处理的。
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"黄亿华\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t
    \n" + + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t\t
  • \n" + + "\t\n" + + "\t\n" + + "\t\n" + + "\t
    \n" + + "\t\t\"searchjack\"\t\t\t\n" + + "\t\n" + + "\t\t
    \n" + + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + + " \t \t 删除\n" + + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + + "\t\t\t\t\t
    \n" + + "\t\t
    极好的工具,
    \n" + + "\t\t
    \n" + + "
    \n" + + "
  • \t\t\t\t
\n" + + "
\n" + + "\t
\n" + + "\n" + + "\n" + + "
\n" + + " \n" + + "
\n" + + "
\n" + + "
\n" + + "\t \n" + + "\t \n" + + "\t 文明上网,理性发言\n" + + "
\n" + + "\t回到页首 | 回到评论列表\n" + + "
\n" + + "
\n" + + "\t\n" + + "
\n" + + "\t关闭相关文章阅读\n" + + "\t\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "\t
\n" + + "\t
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + + "\t开源中国手机客户端:\n" + + "\tAndroid\n" + + "\tiPhone\n" + + "\tWP7\n" + + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""; +} diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java new file mode 100644 index 0000000..31af3b2 --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.samples.scheduler; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Request; + +import java.util.concurrent.TimeUnit; + +/** + * @author code4crafter@gmail.com + */ +public class DelayQueueSchedulerTest { + + @Ignore("infinite") + @Test + public void test() { + DelayQueueScheduler delayQueueScheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS); + delayQueueScheduler.push(new Request("1"), null); + while (true){ + Request poll = delayQueueScheduler.poll(null); + System.out.println(System.currentTimeMillis()+"\t"+poll); + } + } +} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index efa8291..1c4e745 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -17,6 +17,11 @@ webmagic-core ${project.version} + + us.codecraft + xsoup + 0.0.1-SNAPSHOT + net.sf.saxon Saxon-HE diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index b623040..6c19c8a 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,8 +1,15 @@ package us.codecraft.webmagic.selector; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.htmlcleaner.XPatherException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.junit.Assert; import 
org.junit.Ignore; import org.junit.Test; +import us.codecraft.xsoup.XPathEvaluator; +import us.codecraft.xsoup.Xsoup; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1353,6 +1360,7 @@ public class XpathSelectorTest { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); + Selectors.xpath("/abc/").select(""); } @Test @@ -1379,17 +1387,86 @@ public class XpathSelectorTest { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + XpathSelector xpathSelector = new XpathSelector("//a"); time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } System.out.println(System.currentTimeMillis()-time); + time =System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } + System.out.println(System.currentTimeMillis() - time); + + CssSelector cssSelector = new CssSelector("a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 1000; i++) { + cssSelector.selectList(html); + } + System.out.println("css "+(System.currentTimeMillis()-time)); + } + + @Ignore("take long time") + @Test + public void parserPerformanceTest() throws XPatherException { + System.out.println(html.length()); + + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(html); + Document document = Jsoup.parse(html); + + long time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + Jsoup.parse(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + document.select("a"); + } + 
System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + htmlCleaner.clean(html); + } + System.out.println(System.currentTimeMillis()-time); + + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + tagNode.evaluateXPath("//a"); + } + System.out.println(System.currentTimeMillis()-time); + + System.out.println("============="); + + XPathEvaluator compile = Xsoup.compile("//a"); + time =System.currentTimeMillis(); + for (int i = 0; i < 2000; i++) { + compile.evaluate(document); + } + System.out.println(System.currentTimeMillis()-time); + } } diff --git a/zh_docs/README.md b/zh_docs/README.md index 31eb2ba..0ef0b4d 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -29,23 +29,18 @@ Java爬虫 **Spiderman** [https://gitcafe.com/laiweiwei/Spiderman](https://gitca ### 使用maven -webmagic使用maven管理依赖,你可以直接下载webmagic源码进行编译: - - git clone https://github.com/code4craft/webmagic.git - cd webmagic - mvn clean install - -安装后,在项目中添加对应的依赖即可使用webmagic: +webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: us.codecraft webmagic-core - 0.2.0 + 0.3.0 us.codecraft webmagic-extension - 0.2.0 + 0.3.0 + #### 项目结构 @@ -60,7 +55,7 @@ webmagic主要包括两个包: webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 -webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来: +webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:: * **webmagic-saxon**