commit
cc9d319fd9
|
@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.handler.PatternHandler;
|
import us.codecraft.webmagic.handler.PatternHandler;
|
||||||
|
import us.codecraft.webmagic.handler.SubPageProcessor;
|
||||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||||
|
|
||||||
|
@ -32,21 +33,23 @@ public class PatternProcessorDemo {
|
||||||
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
|
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onExtract(Page page) {
|
public SubPageProcessor.MatchOtherProcessor process(Page page) {
|
||||||
|
|
||||||
log.info("Extracting from " + page.getUrl());
|
log.info("Extracting from " + page.getUrl());
|
||||||
page.putField("test", "hello world:)");
|
page.putField("test", "hello world:)");
|
||||||
|
return MatchOtherProcessor.YES;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onHandle(ResultItems result, Task task) {
|
public void handle(ResultItems result, Task task) {
|
||||||
|
|
||||||
log.info("Handling " + result.getRequest().getUrl());
|
log.info("Handling " + result.getRequest().getUrl());
|
||||||
log.info("Retrieved test=" + result.get("test"));
|
log.info("Retrieved test=" + result.get("test"));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
handler.register(processor, pipeline);
|
processor.addHandler(handler);
|
||||||
|
pipeline.addHandler(handler);
|
||||||
|
|
||||||
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
|
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.ResultItems;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
|
||||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
|
||||||
|
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@ -17,7 +15,7 @@ import java.util.UUID;
|
||||||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||||
* its two abstract methods.
|
* its two abstract methods.
|
||||||
*/
|
*/
|
||||||
public abstract class PatternHandler {
|
public abstract class PatternHandler implements SubPageProcessor {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* identity of the handler.
|
* identity of the handler.
|
||||||
|
@ -47,46 +45,25 @@ public abstract class PatternHandler {
|
||||||
return url.matches(pattern);
|
return url.matches(pattern);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public boolean processPage(Page page) {
|
||||||
* registers to both the page processor and the pipeline so the handler could take charge of
|
|
||||||
* both ends of the procedure.
|
|
||||||
*
|
|
||||||
* @param processor
|
|
||||||
* the processor to handle
|
|
||||||
* @param pipeline
|
|
||||||
* the pipeline to handle
|
|
||||||
*/
|
|
||||||
public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
|
|
||||||
|
|
||||||
processor.addHandler(this);
|
|
||||||
pipeline.addHandler(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
|
|
||||||
|
|
||||||
processor.removeHandler(this);
|
|
||||||
pipeline.removeHandler(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean process(Page page) {
|
|
||||||
|
|
||||||
if(match(page.getUrl().toString())) {
|
if(match(page.getUrl().toString())) {
|
||||||
page.putField(id, true);
|
page.putField(id, true);
|
||||||
onExtract(page);
|
process(page);
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean process(ResultItems resultItems, Task task) {
|
public boolean processResult(ResultItems resultItems, Task task) {
|
||||||
|
|
||||||
if(resultItems.isSkip()) {
|
if(resultItems.isSkip()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
|
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
|
||||||
onHandle(resultItems, task);
|
handle(resultItems, task);
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
@ -94,20 +71,20 @@ public abstract class PatternHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* implements this method to extract from page.
|
* Override this method to handle the extraction result. This method MUST be used
|
||||||
*
|
* with PatternPipeline
|
||||||
* @param page
|
|
||||||
* the page to extract
|
|
||||||
*/
|
|
||||||
public abstract void onExtract(Page page);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* implements this method to handle the extraction result.
|
|
||||||
*
|
*
|
||||||
* @param result
|
* @param result
|
||||||
* extraction result
|
* extraction result
|
||||||
* @param task
|
* @param task
|
||||||
*/
|
*/
|
||||||
public abstract void onHandle(ResultItems result, Task task);
|
public void handle(ResultItems result, Task task) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean match(Page page) {
|
||||||
|
|
||||||
|
return match(page.getUrl().toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page;
|
||||||
*/
|
*/
|
||||||
public interface SubPageProcessor {
|
public interface SubPageProcessor {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check whether the SubPageProcessor can process the page.<br>
|
* Check whether the SubPageProcessor can process the page.<br>
|
||||||
* Please DO NOT change page status in this method.
|
* Please DO NOT change page status in this method.
|
||||||
*
|
*
|
||||||
* @param page
|
* @param page
|
||||||
* @return
|
*
|
||||||
*/
|
* @return
|
||||||
public boolean match(Page page);
|
*/
|
||||||
|
public boolean match(Page page);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* process the page, extract urls to fetch, extract the data and store
|
||||||
* process the page, extract urls to fetch, extract the data and store
|
*
|
||||||
*
|
* @param page
|
||||||
* @param page
|
*
|
||||||
* @return whether continue to match
|
* @return whether continue to match
|
||||||
*/
|
*/
|
||||||
public MatchOtherProcessor process(Page page);
|
public MatchOtherProcessor process(Page page);
|
||||||
|
|
||||||
public enum MatchOtherProcessor {
|
public enum MatchOtherProcessor {
|
||||||
YES, NO;
|
YES, NO
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline {
|
||||||
*
|
*
|
||||||
* @param handler the pattern handler
|
* @param handler the pattern handler
|
||||||
*
|
*
|
||||||
* @see PatternHandler#register
|
|
||||||
*/
|
*/
|
||||||
public void addHandler(PatternHandler handler) {
|
public void addHandler(PatternHandler handler) {
|
||||||
|
|
||||||
|
@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline {
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
|
|
||||||
for(PatternHandler handler : handlers) {
|
for(PatternHandler handler : handlers) {
|
||||||
handler.process(resultItems, task);
|
handler.processResult(resultItems, task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor {
|
||||||
page.addTargetRequests(requests);
|
page.addTargetRequests(requests);
|
||||||
for(PatternHandler handler : handlers) {
|
for(PatternHandler handler : handlers) {
|
||||||
if(handler.match(page.getUrl().toString())) {
|
if(handler.match(page.getUrl().toString())) {
|
||||||
handler.process(page);
|
handler.processPage(page);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A handler works only if it is added to BOTH the page processor and the pipeline.
|
|
||||||
* Use PatternHandler's register instead.
|
|
||||||
*
|
*
|
||||||
* @param handler the pattern handler
|
* @param handler the pattern handler
|
||||||
*
|
*
|
||||||
* @see PatternHandler#register
|
*
|
||||||
*/
|
*/
|
||||||
public void addHandler(PatternHandler handler) {
|
public void addHandler(PatternHandler handler) {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue