add xpath2.0 api
parent
5c96407a3d
commit
36494bcfa5
|
@ -7,25 +7,18 @@ import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 命令行输出抽取结果。可用于测试。<br>
|
* 命令行输出抽取结果。可用于测试。<br>
|
||||||
|
*
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* Date: 13-4-21
|
* Date: 13-4-21
|
||||||
* Time: 下午1:45
|
* Time: 下午1:45
|
||||||
*/
|
*/
|
||||||
public class ConsolePipeline implements Pipeline{
|
public class ConsolePipeline implements Pipeline {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems,Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
System.out.println("get page: " + resultItems.getRequest().getUrl());
|
||||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
if (entry.getValue() instanceof Iterable) {
|
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
||||||
Iterable value = (Iterable) entry.getValue();
|
|
||||||
System.out.println(entry.getKey() + ":");
|
|
||||||
for (Object o : value) {
|
|
||||||
System.out.println(o);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
System.out.println(entry.getKey() + ":\t" + entry.getValue());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,6 +63,12 @@ public class Html extends PlainText {
|
||||||
return selectList(xpathSelector, strings);
|
return selectList(xpathSelector, strings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable xpath2(String xpath) {
|
||||||
|
Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath);
|
||||||
|
return selectList(xpathSelector, strings);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
CssSelector cssSelector = new CssSelector(selector);
|
CssSelector cssSelector = new CssSelector(selector);
|
||||||
|
|
|
@ -34,6 +34,11 @@ public class PlainText implements Selectable {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable xpath2(String xpath) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable $(String selector) {
|
public Selectable $(String selector) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
|
|
@ -18,6 +18,14 @@ public interface Selectable {
|
||||||
*/
|
*/
|
||||||
public Selectable xpath(String xpath);
|
public Selectable xpath(String xpath);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Select a list of results using XPath 2.0 syntax.
|
||||||
|
*
|
||||||
|
* @param xpath the XPath 2.0 expression to apply
|
||||||
|
* @return a new Selectable produced by the extraction
|
||||||
|
*/
|
||||||
|
public Selectable xpath2(String xpath);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* select list with css selector
|
* select list with css selector
|
||||||
*
|
*
|
||||||
|
|
|
@ -34,6 +34,10 @@ public class SelectorFactory {
|
||||||
return newSelector(XpathSelector.class, xpath);
|
return newSelector(XpathSelector.class, xpath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Xpath2Selector newXpath2Selector(String xpath) {
|
||||||
|
return newSelector(Xpath2Selector.class, xpath);
|
||||||
|
}
|
||||||
|
|
||||||
public SmartContentSelector newSmartContentSelector(){
|
public SmartContentSelector newSmartContentSelector(){
|
||||||
return newSelector(SmartContentSelector.class);
|
return newSelector(SmartContentSelector.class);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
package us.codecraft.webmagic.schedular;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
import org.apache.commons.lang3.math.NumberUtils;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.schedular.Scheduler;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
|
@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.model.ExtractBy;
|
import us.codecraft.webmagic.model.ExtractBy;
|
||||||
import us.codecraft.webmagic.model.OOSpider;
|
import us.codecraft.webmagic.model.OOSpider;
|
||||||
import us.codecraft.webmagic.model.TargetUrl;
|
import us.codecraft.webmagic.model.TargetUrl;
|
||||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
|
||||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run();
|
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitle() {
|
public String getTitle() {
|
||||||
|
|
|
@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
|
|
@ -5,7 +5,7 @@ import org.junit.Test;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||||
import us.codecraft.webmagic.samples.HuxiuProcessor;
|
import us.codecraft.webmagic.samples.HuxiuProcessor;
|
||||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||||
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
|
import us.codecraft.webmagic.samples.DiandianBlogProcessor;
|
||||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
import us.codecraft.webmagic.pipeline.FreemarkerPipeline;
|
||||||
import us.codecraft.webmagic.samples.SinaBlogProcesser;
|
import us.codecraft.webmagic.samples.SinaBlogProcesser;
|
||||||
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue