commit
cc9d319fd9
|
@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems;
|
|||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
import us.codecraft.webmagic.handler.SubPageProcessor;
|
||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||
|
||||
|
@ -32,21 +33,23 @@ public class PatternProcessorDemo {
|
|||
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
|
||||
|
||||
@Override
|
||||
public void onExtract(Page page) {
|
||||
public SubPageProcessor.MatchOtherProcessor process(Page page) {
|
||||
|
||||
log.info("Extracting from " + page.getUrl());
|
||||
page.putField("test", "hello world:)");
|
||||
return MatchOtherProcessor.YES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onHandle(ResultItems result, Task task) {
|
||||
public void handle(ResultItems result, Task task) {
|
||||
|
||||
log.info("Handling " + result.getRequest().getUrl());
|
||||
log.info("Retrieved test=" + result.get("test"));
|
||||
}
|
||||
};
|
||||
|
||||
handler.register(processor, pipeline);
|
||||
processor.addHandler(handler);
|
||||
pipeline.addHandler(handler);
|
||||
|
||||
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
|
||||
}
|
||||
|
|
|
@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler;
|
|||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
|
@ -17,7 +15,7 @@ import java.util.UUID;
|
|||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||
* its two abstract methods.
|
||||
*/
|
||||
public abstract class PatternHandler {
|
||||
public abstract class PatternHandler implements SubPageProcessor {
|
||||
|
||||
/**
|
||||
* identity of the handler.
|
||||
|
@ -47,46 +45,25 @@ public abstract class PatternHandler {
|
|||
return url.matches(pattern);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers with both the page processor and the pipeline so the handler can take charge of
|
||||
* both ends of the procedure.
|
||||
*
|
||||
* @param processor
|
||||
* the processor to handle
|
||||
* @param pipeline
|
||||
* the pipeline to handle
|
||||
*/
|
||||
public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
|
||||
|
||||
processor.addHandler(this);
|
||||
pipeline.addHandler(this);
|
||||
}
|
||||
|
||||
public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
|
||||
|
||||
processor.removeHandler(this);
|
||||
pipeline.removeHandler(this);
|
||||
}
|
||||
|
||||
public boolean process(Page page) {
|
||||
public boolean processPage(Page page) {
|
||||
|
||||
if(match(page.getUrl().toString())) {
|
||||
page.putField(id, true);
|
||||
onExtract(page);
|
||||
process(page);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean process(ResultItems resultItems, Task task) {
|
||||
public boolean processResult(ResultItems resultItems, Task task) {
|
||||
|
||||
if(resultItems.isSkip()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
|
||||
onHandle(resultItems, task);
|
||||
handle(resultItems, task);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
|
@ -94,20 +71,20 @@ public abstract class PatternHandler {
|
|||
}
|
||||
|
||||
/**
|
||||
* Implement this method to extract data from the page.
|
||||
*
|
||||
* @param page
|
||||
* the page to extract
|
||||
*/
|
||||
public abstract void onExtract(Page page);
|
||||
|
||||
/**
|
||||
* Implement this method to handle the extraction result.
|
||||
* Override this method to handle the extraction result. This method MUST be used
|
||||
* with PatternPipeline
|
||||
*
|
||||
* @param result
|
||||
* extraction result
|
||||
* @param task
|
||||
*/
|
||||
public abstract void onHandle(ResultItems result, Task task);
|
||||
public void handle(ResultItems result, Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(Page page) {
|
||||
|
||||
return match(page.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,21 +13,22 @@ public interface SubPageProcessor {
|
|||
* Please DO NOT change page status in this method.
|
||||
*
|
||||
* @param page
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean match(Page page);
|
||||
|
||||
/**
|
||||
*
|
||||
* Process the page: extract URLs to fetch, extract the data, and store it.
|
||||
*
|
||||
* @param page
|
||||
*
|
||||
* @return whether to continue matching against other processors
|
||||
*/
|
||||
public MatchOtherProcessor process(Page page);
|
||||
|
||||
public enum MatchOtherProcessor {
|
||||
YES, NO;
|
||||
YES, NO
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline {
|
|||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
* @see PatternHandler#register
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
|
@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline {
|
|||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
for(PatternHandler handler : handlers) {
|
||||
handler.process(resultItems, task);
|
||||
handler.processResult(resultItems, task);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor {
|
|||
page.addTargetRequests(requests);
|
||||
for(PatternHandler handler : handlers) {
|
||||
if(handler.match(page.getUrl().toString())) {
|
||||
handler.process(page);
|
||||
handler.processPage(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A handler works only if it is added to BOTH the page processor and the pipeline.
|
||||
* Use PatternHandler's register instead.
|
||||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
* @see PatternHandler#register
|
||||
*
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
|
|
Loading…
Reference in New Issue