ConfigurablePageProcessor #91

master
yihua.huang 2014-04-05 23:40:10 +08:00
parent 1090d070d9
commit 9b2cb43f47
8 changed files with 213 additions and 84 deletions

View File

@ -131,6 +131,7 @@ public class Html extends PlainText {
}
/**
 * Returns the {@code document} field of this instance.
 * <p>
 * {@code initDocument()} is invoked on every call before the field is read.
 * NOTE(review): initDocument() is not visible in this fragment; presumably it
 * makes sure the document has been populated before it is handed out - confirm.
 *
 * @return the value of the {@code document} field after {@code initDocument()} has run
 */
public Document getDocument() {
initDocument();
return document;
}

View File

@ -0,0 +1,49 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * A {@link PageProcessor} whose extraction logic is supplied as data: every
 * {@link ExtractRule} is evaluated against the page's HTML and the outcome is
 * stored in the page's result items under the rule's field name.
 * <p>
 * A rule flagged as not-null that yields nothing (an empty list for a multi
 * rule, {@code null} for a single rule) marks the page as skipped instead of
 * storing a value. The remaining rules are still evaluated afterwards.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ConfigurablePageProcessor implements PageProcessor {

    private Site site;

    private List<ExtractRule> extractRules;

    /**
     * @param site         the site configuration returned by {@link #getSite()}
     * @param extractRules the rules applied, in list order, to every processed page
     */
    public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
        this.extractRules = extractRules;
        this.site = site;
    }

    /**
     * Applies every configured rule to the given page.
     *
     * @param page the page to extract fields from
     */
    @Override
    public void process(Page page) {
        for (ExtractRule rule : extractRules) {
            if (!rule.isMulti()) {
                extractSingle(page, rule);
            } else {
                extractMulti(page, rule);
            }
        }
    }

    /** Evaluates a multi-valued rule and stores the resulting list, or skips the page. */
    private void extractMulti(Page page, ExtractRule rule) {
        List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
        if (rule.isNotNull() && values.isEmpty()) {
            page.setSkip(true);
            return;
        }
        page.getResultItems().put(rule.getFieldName(), values);
    }

    /** Evaluates a single-valued rule and stores the resulting string, or skips the page. */
    private void extractSingle(Page page, ExtractRule rule) {
        String value = page.getHtml().selectDocument(rule.getSelector());
        if (rule.isNotNull() && value == null) {
            page.setSkip(true);
            return;
        }
        page.getResultItems().put(rule.getFieldName(), value);
    }

    /**
     * @return the site configuration passed to the constructor
     */
    @Override
    public Site getSite() {
        return site;
    }
}

View File

@ -0,0 +1,11 @@
package us.codecraft.webmagic.configurable;
/**
 * The kinds of extraction expression a rule can be written in.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public enum ExpressionType {

    /** The expression is an XPath query. */
    XPath,

    /** The expression is a regular expression. */
    Regex,

    /** The expression is a CSS selector. */
    Css,

    /** The expression is a JsonPath query. */
    JsonPath
}

View File

@ -0,0 +1,113 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Describes how a single named field is extracted: which expression language
 * to use, the expression itself, optional expression parameters, whether the
 * field is multi-valued and whether an empty result is acceptable.
 * <p>
 * The {@link Selector} is compiled from these settings on the first call to
 * {@link #getSelector()} and then cached. Changing the expression settings
 * after that point does not recompile the cached selector; use
 * {@link #setSelector(Selector)} to replace it explicitly.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class ExtractRule {

    private String fieldName;

    private ExpressionType expressionType;

    private String expressionValue;

    /** Optional; stays {@code null} unless set by the caller. */
    private String[] expressionParams;

    private boolean multi = false;

    /** Cached selector; volatile because it is read outside the lock in {@link #getSelector()}. */
    private volatile Selector selector;

    private boolean notNull = false;

    /**
     * @return the name under which the extracted value is stored
     */
    public String getFieldName() {
        return fieldName;
    }

    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    /**
     * @return the expression language used to interpret {@link #getExpressionValue()}
     */
    public ExpressionType getExpressionType() {
        return expressionType;
    }

    public void setExpressionType(ExpressionType expressionType) {
        this.expressionType = expressionType;
    }

    /**
     * @return the raw expression text
     */
    public String getExpressionValue() {
        return expressionValue;
    }

    public void setExpressionValue(String expressionValue) {
        this.expressionValue = expressionValue;
    }

    /**
     * @return the optional expression parameters, or {@code null} if none were set
     */
    public String[] getExpressionParams() {
        return expressionParams;
    }

    public void setExpressionParams(String[] expressionParams) {
        this.expressionParams = expressionParams;
    }

    /**
     * @return {@code true} if the rule yields a list of values rather than a single value
     */
    public boolean isMulti() {
        return multi;
    }

    public void setMulti(boolean multi) {
        this.multi = multi;
    }

    /**
     * Returns the selector for this rule, compiling it from the expression
     * settings on first use and caching it afterwards.
     *
     * @return the cached or freshly compiled selector
     * @throws NumberFormatException if the type is {@code Regex} and the first
     *                               expression parameter is not a valid integer
     */
    public Selector getSelector() {
        if (selector == null) {
            synchronized (this) {
                // Re-check under the lock so that only one thread compiles.
                if (selector == null) {
                    selector = compileSelector();
                }
            }
        }
        return selector;
    }

    /**
     * Builds a selector from the expression type, value and optional first parameter.
     * <ul>
     * <li>{@code Css}: the first parameter, if present, is passed as the second argument of {@code $}</li>
     * <li>{@code Regex}: the first parameter, if present, is parsed as an integer and passed to {@code regex}</li>
     * <li>{@code XPath} and {@code JsonPath}: parameters are ignored</li>
     * </ul>
     */
    private Selector compileSelector() {
        switch (expressionType) {
            case Css:
                if (hasExpressionParams()) {
                    return $(expressionValue, expressionParams[0]);
                } else {
                    return $(expressionValue);
                }
            case XPath:
                return xpath(expressionValue);
            case Regex:
                if (hasExpressionParams()) {
                    return regex(expressionValue, Integer.parseInt(expressionParams[0]));
                } else {
                    return regex(expressionValue);
                }
            case JsonPath:
                return new JsonPathSelector(expressionValue);
            default:
                return xpath(expressionValue);
        }
    }

    /**
     * Null-safe check for at least one expression parameter. The params array
     * is {@code null} by default, so it must not be dereferenced unguarded.
     */
    private boolean hasExpressionParams() {
        return expressionParams != null && expressionParams.length >= 1;
    }

    /**
     * Replaces the cached selector, bypassing compilation from the expression settings.
     *
     * @param selector the selector to use; {@code null} causes the next
     *                 {@link #getSelector()} call to compile one again
     */
    public void setSelector(Selector selector) {
        this.selector = selector;
    }

    /**
     * @return {@code true} if an empty extraction result is not acceptable for this field
     */
    public boolean isNotNull() {
        return notNull;
    }

    public void setNotNull(boolean notNull) {
        this.notNull = notNull;
    }
}

View File

@ -1,15 +0,0 @@
package us.codecraft.webmagic.configurable;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Field-level marker annotation that carries a single string value and is
 * retained at runtime, so it can be read reflectively.
 *
 * @author yihua.huang@dianping.com
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface Inject {

    /**
     * @return the string attached to the annotated field; empty when not specified
     */
    String value() default "";
}

View File

@ -1,18 +0,0 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.Map;
/**
 * Intended to inject properties into an object by {@link Inject} annotation.
 * <p>
 * The current implementation is a pass-through: {@link #load(Object, Map)}
 * hands back the object it was given and does not read the properties.
 *
 * @author yihua.huang@dianping.com
 */
public class PropertyLoader<T> {

    /**
     * Returns the given object unchanged.
     *
     * @param object     the object that would receive the properties
     * @param properties the property values; not consulted at present
     * @return the very same {@code object} reference that was passed in
     */
    public T load(T object, Map<String, String> properties) {
        // No injection happens here yet; the target is returned as-is.
        final T target = object;
        return target;
    }
}

View File

@ -1,51 +0,0 @@
package us.codecraft.webmagic.example;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.configurable.Inject;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Example blog page processor whose link regex and XPath expressions are held
 * in fields annotated with {@link Inject} rather than being hard-coded.
 * Nothing in this class assigns those fields itself.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ConfigurableBlogPageProcessor implements PageProcessor {

    private Site site = Site.me().setDomain("my.oschina.net");

    @Inject("linkRegex")
    private String linkRegex;

    @Inject("titleXpath")
    private String titleXpath;

    @Inject("contentXpath")
    private String contentXpath;

    @Inject("tagsXpath")
    private String tagsXpath;

    /**
     * Queues links matching {@code linkRegex} as follow-up requests, then stores
     * the "title", "content" and "tags" fields. The page is marked as skipped
     * when no title ends up in the result items.
     *
     * @param page the page being processed
     */
    @Override
    public void process(Page page) {
        // Follow every link on the page that matches the configured regex.
        List<String> matchedLinks = page.getHtml().links().regex(linkRegex).all();
        page.addTargetRequests(matchedLinks);

        String title = page.getHtml().xpath(titleXpath).toString();
        page.putField("title", title);
        if (null == page.getResultItems().get("title")) {
            //skip this page
            page.setSkip(true);
        }

        String content = page.getHtml().smartContent().toString();
        page.putField("content", content);

        List<String> tags = page.getHtml().xpath(tagsXpath).all();
        page.putField("tags", tags);
    }

    /**
     * @return the site configuration for my.oschina.net
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the example spider with two threads against a fixed start URL.
     */
    public static void main(String[] args) {
        ConfigurableBlogPageProcessor processor = new ConfigurableBlogPageProcessor();
        Spider.create(processor)
                .addUrl("http://my.oschina.net/flashsword/blog")
                .thread(2)
                .run();
    }
}

View File

@ -0,0 +1,39 @@
package us.codecraft.webmagic.configurable;
import org.junit.Test;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import java.util.ArrayList;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Checks that {@link ConfigurablePageProcessor} extracts the fields described
 * by XPath rules from the page served by {@link MockGithubDownloader}.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public class ConfigurablePageProcessorTest {

    @Test
    public void test() throws Exception {
        List<ExtractRule> rules = new ArrayList<ExtractRule>();
        rules.add(xpathRule("title", "//title"));
        rules.add(xpathRule("star",
                "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"));

        ResultItems items = Spider.create(new ConfigurablePageProcessor(Site.me(), rules))
                .setDownloader(new MockGithubDownloader())
                .get("https://github.com/code4craft/webmagic");

        assertThat(items.getAll()).containsEntry("title", "<title>code4craft/webmagic &middot; GitHub</title>");
        assertThat(items.getAll()).containsEntry("star", " 86 ");
    }

    /** Builds a single-valued XPath rule that stores its result under {@code fieldName}. */
    private static ExtractRule xpathRule(String fieldName, String expression) {
        ExtractRule rule = new ExtractRule();
        rule.setExpressionType(ExpressionType.XPath);
        rule.setExpressionValue(expression);
        rule.setFieldName(fieldName);
        return rule;
    }
}