add xpath2.0 api
parent
5c96407a3d
commit
36494bcfa5
|
@ -7,25 +7,18 @@ import java.util.Map;
|
|||
|
||||
/**
|
||||
* 命令行输出抽取结果。可用于测试。<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:45
|
||||
* Date: 13-4-21
|
||||
* Time: 下午1:45
|
||||
*/
|
||||
public class ConsolePipeline implements Pipeline{
|
||||
public class ConsolePipeline implements Pipeline {
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems,Task task) {
|
||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
System.out.println("get page: " + resultItems.getRequest().getUrl());
|
||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||
if (entry.getValue() instanceof Iterable) {
|
||||
Iterable value = (Iterable) entry.getValue();
|
||||
System.out.println(entry.getKey() + ":");
|
||||
for (Object o : value) {
|
||||
System.out.println(o);
|
||||
}
|
||||
} else {
|
||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,6 +63,12 @@ public class Html extends PlainText {
|
|||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath2(String xpath) {
|
||||
Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath);
|
||||
return selectList(xpathSelector, strings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
CssSelector cssSelector = new CssSelector(selector);
|
||||
|
|
|
@ -34,6 +34,11 @@ public class PlainText implements Selectable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable xpath2(String xpath) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Selectable $(String selector) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -18,6 +18,14 @@ public interface Selectable {
|
|||
*/
|
||||
public Selectable xpath(String xpath);
|
||||
|
||||
/**
|
||||
* select list with xpath 2.0 syntax
|
||||
*
|
||||
* @param xpath
|
||||
* @return new Selectable after extract
|
||||
*/
|
||||
public Selectable xpath2(String xpath);
|
||||
|
||||
/**
|
||||
* select list with css selector
|
||||
*
|
||||
|
|
|
@ -34,6 +34,10 @@ public class SelectorFactory {
|
|||
return newSelector(XpathSelector.class, xpath);
|
||||
}
|
||||
|
||||
public Xpath2Selector newXpath2Selector(String xpath) {
|
||||
return newSelector(Xpath2Selector.class, xpath);
|
||||
}
|
||||
|
||||
public SmartContentSelector newSmartContentSelector(){
|
||||
return newSelector(SmartContentSelector.class);
|
||||
}
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
package us.codecraft.webmagic.schedular;
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.schedular.Scheduler;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.LinkedHashSet;
|
|
@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site;
|
|||
import us.codecraft.webmagic.model.ExtractBy;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.TargetUrl;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||
|
||||
/**
|
||||
|
@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{
|
|||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run();
|
||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run();
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
|
|
|
@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples;
|
|||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
|
|
|
@ -5,7 +5,7 @@ import org.junit.Test;
|
|||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
import us.codecraft.webmagic.samples.HuxiuProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
|||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
|||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
|||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||
import us.codecraft.webmagic.samples.SinaBlogProcesser;
|
||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
|
Loading…
Reference in New Issue