diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index a623518..ab2b544 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -9,8 +9,8 @@ import java.util.ArrayList;
import java.util.List;
/**
- *
* Object storing extracted result and urls to fetch.
+ * Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
@@ -19,9 +19,9 @@ import java.util.List;
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
*
* @author code4crafter@gmail.com
- * @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
+ * @since 0.1.0
*/
public class Page {
@@ -31,6 +31,8 @@ public class Page {
private Html html;
+ private String rawText;
+
private Selectable url;
private int statusCode;
@@ -62,9 +64,17 @@ public class Page {
* @return html
*/
public Html getHtml() {
+ if (html == null) {
+ html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
+ }
return html;
}
+ /**
+ * @param html the already-parsed html to store on this page
+ * @deprecated since 0.4.0
+ * The html is now parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
+ */
public void setHtml(Html html) {
this.html = html;
}
@@ -95,7 +105,7 @@ public class Page {
*
* @param requests
*/
- public void addTargetRequests(List requests,long priority) {
+ public void addTargetRequests(List requests, long priority) {
synchronized (targetRequests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
@@ -172,13 +182,22 @@ public class Page {
this.statusCode = statusCode;
}
+ public String getRawText() {
+ return rawText;
+ }
+
+ public void setRawText(String rawText) {
+ this.rawText = rawText;
+ }
+
@Override
public String toString() {
return "Page{" +
"request=" + request +
", resultItems=" + resultItems +
- ", html=" + html +
+ ", rawText='" + rawText + '\'' +
", url=" + url +
+ ", statusCode=" + statusCode +
", targetRequests=" + targetRequests +
'}';
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 389633f..b6baaa7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -162,7 +162,7 @@ public class HttpClientDownloader implements Downloader {
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
Page page = new Page();
- page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
+ page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
index f72efe0..e8ac20c 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
@@ -31,8 +31,7 @@ public class OschinaBlog {
private Date date;
public static void main(String[] args) {
- OOSpider.create(Site.me().setSleepTime(0)
- , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
+ OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
.addUrl("http://my.oschina.net/flashsword/blog").run();
}