From a01312930ae4813595abea87090bf7959940db4e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 27 Nov 2013 22:01:34 +0800 Subject: [PATCH] #39 Parsing html after page.getHtml() --- .../main/java/us/codecraft/webmagic/Page.java | 27 ++++++++++++++++--- .../downloader/HttpClientDownloader.java | 2 +- .../webmagic/example/OschinaBlog.java | 3 +-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index a623518..ab2b544 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -9,8 +9,8 @@ import java.util.ArrayList; import java.util.List; /** - * * Object storing extracted result and urls to fetch.
+ * Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
@@ -19,9 +19,9 @@ import java.util.List; * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
- * @since 0.1.0 * @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.processor.PageProcessor + * @since 0.1.0 */ public class Page { @@ -31,6 +31,8 @@ public class Page { private Html html; + private String rawText; + private Selectable url; private int statusCode; @@ -62,9 +64,17 @@ public class Page { * @return html */ public Html getHtml() { + if (html == null) { + html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl())); + } return html; } + /** + * @param html + * @deprecated since 0.4.0 + * The html is parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead. + */ public void setHtml(Html html) { this.html = html; } @@ -95,7 +105,7 @@ public class Page { * * @param requests */ - public void addTargetRequests(List requests,long priority) { + public void addTargetRequests(List requests, long priority) { synchronized (targetRequests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { @@ -172,13 +182,22 @@ public class Page { this.statusCode = statusCode; } + public String getRawText() { + return rawText; + } + + public void setRawText(String rawText) { + this.rawText = rawText; + } + @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + - ", html=" + html + + ", rawText='" + rawText + '\'' + ", url=" + url + + ", statusCode=" + statusCode + ", targetRequests=" + targetRequests + '}'; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 389633f..b6baaa7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -162,7 +162,7 @@ public class HttpClientDownloader implements Downloader { 
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); Page page = new Page(); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); + page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index f72efe0..e8ac20c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -31,8 +31,7 @@ public class OschinaBlog { private Date date; public static void main(String[] args) { - OOSpider.create(Site.me().setSleepTime(0) - , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) + OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) .addUrl("http://my.oschina.net/flashsword/blog").run(); }