#39 Parsing html after page.getHtml()
parent
f63d33b457
commit
a01312930a
|
@ -9,8 +9,8 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* Object storing extracted result and urls to fetch.<br>
|
* Object storing extracted result and urls to fetch.<br>
|
||||||
|
* Not thread safe.<br>
|
||||||
* Main method: <br>
|
* Main method: <br>
|
||||||
* {@link #getUrl()} get url of current page <br>
|
* {@link #getUrl()} get url of current page <br>
|
||||||
* {@link #getHtml()} get content of current page <br>
|
* {@link #getHtml()} get content of current page <br>
|
||||||
|
@ -19,9 +19,9 @@ import java.util.List;
|
||||||
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
|
||||||
*
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
|
||||||
* @see us.codecraft.webmagic.downloader.Downloader
|
* @see us.codecraft.webmagic.downloader.Downloader
|
||||||
* @see us.codecraft.webmagic.processor.PageProcessor
|
* @see us.codecraft.webmagic.processor.PageProcessor
|
||||||
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class Page {
|
public class Page {
|
||||||
|
|
||||||
|
@ -31,6 +31,8 @@ public class Page {
|
||||||
|
|
||||||
private Html html;
|
private Html html;
|
||||||
|
|
||||||
|
private String rawText;
|
||||||
|
|
||||||
private Selectable url;
|
private Selectable url;
|
||||||
|
|
||||||
private int statusCode;
|
private int statusCode;
|
||||||
|
@ -62,9 +64,17 @@ public class Page {
|
||||||
* @return html
|
* @return html
|
||||||
*/
|
*/
|
||||||
public Html getHtml() {
|
public Html getHtml() {
|
||||||
|
if (html == null) {
|
||||||
|
html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
|
||||||
|
}
|
||||||
return html;
|
return html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param html already-parsed html to store on this page
|
||||||
|
* @deprecated since 0.4.0
|
||||||
|
* The html is now parsed lazily, on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
|
||||||
|
*/
|
||||||
public void setHtml(Html html) {
|
public void setHtml(Html html) {
|
||||||
this.html = html;
|
this.html = html;
|
||||||
}
|
}
|
||||||
|
@ -95,7 +105,7 @@ public class Page {
|
||||||
*
|
*
|
||||||
* @param requests
|
* @param requests
|
||||||
*/
|
*/
|
||||||
public void addTargetRequests(List<String> requests,long priority) {
|
public void addTargetRequests(List<String> requests, long priority) {
|
||||||
synchronized (targetRequests) {
|
synchronized (targetRequests) {
|
||||||
for (String s : requests) {
|
for (String s : requests) {
|
||||||
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
|
||||||
|
@ -172,13 +182,22 @@ public class Page {
|
||||||
this.statusCode = statusCode;
|
this.statusCode = statusCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getRawText() {
|
||||||
|
return rawText;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRawText(String rawText) {
|
||||||
|
this.rawText = rawText;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Page{" +
|
return "Page{" +
|
||||||
"request=" + request +
|
"request=" + request +
|
||||||
", resultItems=" + resultItems +
|
", resultItems=" + resultItems +
|
||||||
", html=" + html +
|
", rawText='" + rawText + '\'' +
|
||||||
", url=" + url +
|
", url=" + url +
|
||||||
|
", statusCode=" + statusCode +
|
||||||
", targetRequests=" + targetRequests +
|
", targetRequests=" + targetRequests +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
|
@ -162,7 +162,7 @@ public class HttpClientDownloader implements Downloader {
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
|
page.setRawText(content);
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
page.setRequest(request);
|
page.setRequest(request);
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
|
|
|
@ -31,8 +31,7 @@ public class OschinaBlog {
|
||||||
private Date date;
|
private Date date;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().setSleepTime(0)
|
OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
|
||||||
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class)
|
|
||||||
.addUrl("http://my.oschina.net/flashsword/blog").run();
|
.addUrl("http://my.oschina.net/flashsword/blog").run();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue