ConfigurablePageProcessor #91
parent
1090d070d9
commit
9b2cb43f47
|
@ -131,6 +131,7 @@ public class Html extends PlainText {
|
|||
}
|
||||
|
||||
public Document getDocument() {
|
||||
initDocument();
|
||||
return document;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class ConfigurablePageProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
private List<ExtractRule> extractRules;
|
||||
|
||||
public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
|
||||
this.site = site;
|
||||
this.extractRules = extractRules;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
for (ExtractRule extractRule : extractRules) {
|
||||
if (extractRule.isMulti()) {
|
||||
List<String> results = page.getHtml().selectDocumentForList(extractRule.getSelector());
|
||||
if (extractRule.isNotNull() && results.size() == 0) {
|
||||
page.setSkip(true);
|
||||
} else {
|
||||
page.getResultItems().put(extractRule.getFieldName(), results);
|
||||
}
|
||||
} else {
|
||||
String result = page.getHtml().selectDocument(extractRule.getSelector());
|
||||
if (extractRule.isNotNull() && result == null) {
|
||||
page.setSkip(true);
|
||||
} else {
|
||||
page.getResultItems().put(extractRule.getFieldName(), result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
/**
 * The kinds of extraction expression an {@link ExtractRule} can carry.
 *
 * @author code4crafter@gmail.com
 * @date 14-4-5
 */
public enum ExpressionType {

    /** The expression value is an XPath expression. */
    XPath,

    /** The expression value is a regular expression. */
    Regex,

    /** The expression value is a CSS selector. */
    Css,

    /** The expression value is a JsonPath expression. */
    JsonPath

}
|
|
@ -0,0 +1,113 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import us.codecraft.webmagic.selector.JsonPathSelector;
|
||||
import us.codecraft.webmagic.selector.Selector;
|
||||
|
||||
import static us.codecraft.webmagic.selector.Selectors.*;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public class ExtractRule {
|
||||
|
||||
private String fieldName;
|
||||
|
||||
private ExpressionType expressionType;
|
||||
|
||||
private String expressionValue;
|
||||
|
||||
private String[] expressionParams;
|
||||
|
||||
private boolean multi = false;
|
||||
|
||||
private volatile Selector selector;
|
||||
|
||||
private boolean notNull = false;
|
||||
|
||||
public String getFieldName() {
|
||||
return fieldName;
|
||||
}
|
||||
|
||||
public void setFieldName(String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
public ExpressionType getExpressionType() {
|
||||
return expressionType;
|
||||
}
|
||||
|
||||
public void setExpressionType(ExpressionType expressionType) {
|
||||
this.expressionType = expressionType;
|
||||
}
|
||||
|
||||
public String getExpressionValue() {
|
||||
return expressionValue;
|
||||
}
|
||||
|
||||
public void setExpressionValue(String expressionValue) {
|
||||
this.expressionValue = expressionValue;
|
||||
}
|
||||
|
||||
public String[] getExpressionParams() {
|
||||
return expressionParams;
|
||||
}
|
||||
|
||||
public void setExpressionParams(String[] expressionParams) {
|
||||
this.expressionParams = expressionParams;
|
||||
}
|
||||
|
||||
public boolean isMulti() {
|
||||
return multi;
|
||||
}
|
||||
|
||||
public void setMulti(boolean multi) {
|
||||
this.multi = multi;
|
||||
}
|
||||
|
||||
public Selector getSelector() {
|
||||
if (selector == null) {
|
||||
synchronized (this) {
|
||||
if (selector == null) {
|
||||
selector = compileSelector();
|
||||
}
|
||||
}
|
||||
}
|
||||
return selector;
|
||||
}
|
||||
|
||||
private Selector compileSelector() {
|
||||
switch (expressionType) {
|
||||
case Css:
|
||||
if (expressionParams.length >= 1) {
|
||||
return $(expressionValue, expressionParams[0]);
|
||||
} else {
|
||||
return $(expressionValue);
|
||||
}
|
||||
case XPath:
|
||||
return xpath(expressionValue);
|
||||
case Regex:
|
||||
if (expressionParams.length >= 1) {
|
||||
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
|
||||
} else {
|
||||
return regex(expressionValue);
|
||||
}
|
||||
case JsonPath:
|
||||
return new JsonPathSelector(expressionValue);
|
||||
default:
|
||||
return xpath(expressionValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void setSelector(Selector selector) {
|
||||
this.selector = selector;
|
||||
}
|
||||
|
||||
public boolean isNotNull() {
|
||||
return notNull;
|
||||
}
|
||||
|
||||
public void setNotNull(boolean notNull) {
|
||||
this.notNull = notNull;
|
||||
}
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Field-level marker annotation, retained at runtime so it can be read
 * reflectively.
 *
 * @author yihua.huang@dianping.com
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface Inject {

    /**
     * Presumably the key of the property to inject into the annotated field
     * (confirm against the consumer of this annotation).
     *
     * @return the configured value; the empty string when not specified
     */
    String value() default "";

}
|
|
@ -1,18 +0,0 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Intended to inject properties into an object by {@link Inject} annotation.
 * <p>
 * The current implementation performs no injection: {@link #load} hands the
 * given object back untouched and ignores the properties map.
 *
 * @author yihua.huang@dianping.com
 */
public class PropertyLoader<T> {

    /**
     * Returns the given object as-is.
     *
     * @param object     the instance to populate; may be {@code null}
     * @param properties the property values; currently unused
     * @return the very same {@code object} reference that was passed in
     */
    public T load(T object, Map<String, String> properties) {
        // Pass-through: nothing is read from 'properties' yet.
        final T target = object;
        return target;
    }

}
|
|
@ -1,51 +0,0 @@
|
|||
package us.codecraft.webmagic.example;
|
||||
|
||||
import java.util.List;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.configurable.Inject;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
public class ConfigurableBlogPageProcessor implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setDomain("my.oschina.net");
|
||||
|
||||
@Inject("linkRegex")
|
||||
private String linkRegex;
|
||||
|
||||
@Inject("titleXpath")
|
||||
private String titleXpath;
|
||||
|
||||
@Inject("contentXpath")
|
||||
private String contentXpath;
|
||||
|
||||
@Inject("tagsXpath")
|
||||
private String tagsXpath;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
List<String> links = page.getHtml().links().regex(linkRegex).all();
|
||||
page.addTargetRequests(links);
|
||||
page.putField("title", page.getHtml().xpath(titleXpath).toString());
|
||||
if (page.getResultItems().get("title") == null) {
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
}
|
||||
page.putField("content", page.getHtml().smartContent().toString());
|
||||
page.putField("tags", page.getHtml().xpath(tagsXpath).all());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new ConfigurableBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package us.codecraft.webmagic.configurable;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.downloader.MockGithubDownloader;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @date 14-4-5
|
||||
*/
|
||||
public class ConfigurablePageProcessorTest {
|
||||
|
||||
@Test
|
||||
public void test() throws Exception {
|
||||
List<ExtractRule> extractRules = new ArrayList<ExtractRule>();
|
||||
ExtractRule extractRule = new ExtractRule();
|
||||
extractRule.setExpressionType(ExpressionType.XPath);
|
||||
extractRule.setExpressionValue("//title");
|
||||
extractRule.setFieldName("title");
|
||||
extractRules.add(extractRule);
|
||||
extractRule = new ExtractRule();
|
||||
extractRule.setExpressionType(ExpressionType.XPath);
|
||||
extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
|
||||
extractRule.setFieldName("star");
|
||||
extractRules.add(extractRule);
|
||||
ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules))
|
||||
.setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic");
|
||||
assertThat(resultItems.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
|
||||
assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue