commit
474f785dab
|
@ -0,0 +1,53 @@
|
|||
package us.codecraft.webmagic.example;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 21:23
|
||||
*/
|
||||
public class PatternProcessorDemo {
|
||||
|
||||
private static Logger log = Logger.getLogger(PatternProcessorDemo.class);
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
PatternPageProcessor processor
|
||||
= new PatternPageProcessor("http://item.jd.com/981821.html",
|
||||
PatternPageProcessor.TARGET_PATTERN_ALL
|
||||
);
|
||||
|
||||
PatternPipeline pipeline = new PatternPipeline();
|
||||
|
||||
// define a handler which handles only "http://item.jd.com/.*"
|
||||
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
|
||||
|
||||
@Override
|
||||
public void onExtract(Page page) {
|
||||
|
||||
log.info("Extracting from " + page.getUrl());
|
||||
page.putField("test", "hello world:)");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onHandle(ResultItems result, Task task) {
|
||||
|
||||
log.info("Handling " + result.getRequest().getUrl());
|
||||
log.info("Retrieved test=" + result.get("test"));
|
||||
}
|
||||
};
|
||||
|
||||
handler.register(processor, pipeline);
|
||||
|
||||
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,113 @@
|
|||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 03, 2014
|
||||
* Time: 10:00
|
||||
* <p></p>
|
||||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||
* its two abstract methods.
|
||||
*/
|
||||
public abstract class PatternHandler {
|
||||
|
||||
/**
|
||||
* identity of the handler.
|
||||
*/
|
||||
protected String id;
|
||||
|
||||
/**
|
||||
* match pattern. only matched page should be handled.
|
||||
*/
|
||||
protected String pattern;
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* url pattern to handle
|
||||
*/
|
||||
protected PatternHandler(String pattern) {
|
||||
|
||||
this.pattern = pattern;
|
||||
this.id = UUID.randomUUID().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* determine if the page should be handled.
|
||||
*/
|
||||
public boolean match(String url) {
|
||||
|
||||
return url.matches(pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* registers to both the page processor and the pipeline so the handler could take charge of
|
||||
* both end of procedure.
|
||||
*
|
||||
* @param processor
|
||||
* the processor to handle
|
||||
* @param pipeline
|
||||
* the pipeline to handle
|
||||
*/
|
||||
public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
|
||||
|
||||
processor.addHandler(this);
|
||||
pipeline.addHandler(this);
|
||||
}
|
||||
|
||||
public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
|
||||
|
||||
processor.removeHandler(this);
|
||||
pipeline.removeHandler(this);
|
||||
}
|
||||
|
||||
public boolean process(Page page) {
|
||||
|
||||
if(match(page.getUrl().toString())) {
|
||||
page.putField(id, true);
|
||||
onExtract(page);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean process(ResultItems resultItems, Task task) {
|
||||
|
||||
if(resultItems.isSkip()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
|
||||
onHandle(resultItems, task);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* implements this method to extract from page.
|
||||
*
|
||||
* @param page
|
||||
* the page to extract
|
||||
*/
|
||||
public abstract void onExtract(Page page);
|
||||
|
||||
/**
|
||||
* implements this method to handle the extraction result.
|
||||
*
|
||||
* @param result
|
||||
* extraction result
|
||||
* @param task
|
||||
*/
|
||||
public abstract void onHandle(ResultItems result, Task task);
|
||||
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 20:44
|
||||
*/
|
||||
public class PatternPipeline implements Pipeline {
|
||||
|
||||
protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
|
||||
|
||||
/**
|
||||
* A handler works only if it is added to BOTH the page processor and the pipeline.
|
||||
* Uses PatternHandler's register instead.
|
||||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
* @see PatternHandler#register
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
handlers.add(handler);
|
||||
}
|
||||
|
||||
public void removeHandler(PatternHandler handler) {
|
||||
|
||||
handlers.remove(handler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
for(PatternHandler handler : handlers) {
|
||||
handler.process(resultItems, task);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 15:36
|
||||
* <p></p>
|
||||
* A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern.
|
||||
*
|
||||
* @see us.codecraft.webmagic.handler.PatternHandler
|
||||
*/
|
||||
public class PatternPageProcessor implements PageProcessor {
|
||||
|
||||
public static final String TARGET_PATTERN_ALL = "http://*";
|
||||
|
||||
protected Site site;
|
||||
|
||||
protected String targetPattern;
|
||||
|
||||
protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
|
||||
|
||||
public PatternPageProcessor(String startUrl, String targetPattern) {
|
||||
|
||||
this.targetPattern = targetPattern;
|
||||
|
||||
this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));
|
||||
this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*",
|
||||
"[^\"'#]*") + ")";
|
||||
|
||||
site.setUserAgent("Chrome/5.0.354.0");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
|
||||
|
||||
List<String> requests = page.getHtml().links().regex(targetPattern).all();
|
||||
page.addTargetRequests(requests);
|
||||
for(PatternHandler handler : handlers) {
|
||||
if(handler.match(page.getUrl().toString())) {
|
||||
handler.process(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A handler works only if it is added to BOTH the page processor and the pipeline.
|
||||
* Uses PatternHandler's register instead.
|
||||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
* @see PatternHandler#register
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
handlers.add(handler);
|
||||
}
|
||||
|
||||
public void removeHandler(PatternHandler handler) {
|
||||
|
||||
handlers.remove(handler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
|
||||
return site;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue