From 9606a173cd9f887778f0a60669912a06e2d4d87d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 13 Mar 2014 22:55:50 +0800 Subject: [PATCH] fix ZipCodePageProcessor --- .../webmagic/selector/PlainText.java | 7 ++- .../webmagic/selector/Selectable.java | 7 +++ .../scheduler/ZipCodePageProcessor.java | 44 +++++++++---------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index bb1b868..9d5c385 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -109,7 +109,7 @@ public class PlainText implements Selectable { } @Override - public String toString() { + public String get() { if (CollectionUtils.isNotEmpty(all())) { return all().get(0); } else { @@ -117,6 +117,11 @@ public class PlainText implements Selectable { } } + @Override + public String toString() { + return get(); + } + @Override public boolean match() { return strings != null && strings.size() > 0; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 6b4410e..aa1bb62 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -99,6 +99,13 @@ public interface Selectable { */ public String toString(); + /** + * single string result + * + * @return single string result + */ + public String get(); + /** * if result exist for select * diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java index ddbaa08..3f2de70 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java @@ -9,8 +9,9 @@ import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.PriorityScheduler; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import static us.codecraft.webmagic.selector.Selectors.regex; import static us.codecraft.webmagic.selector.Selectors.xpath; /** @@ -19,16 +20,16 @@ import static us.codecraft.webmagic.selector.Selectors.xpath; public class ZipCodePageProcessor implements PageProcessor { private Site site = Site.me().setCharset("gb2312") - .setSleepTime(100).addStartUrl("http://www.ip138.com/post/"); + .setSleepTime(100); @Override public void process(Page page) { if (page.getUrl().toString().equals("http://www.ip138.com/post/")) { processCountry(page); - } else if (page.getUrl().regex("http://www\\.ip138\\.com/post/\\w+[/]?$").toString() != null) { - processProvince(page); - } else { + } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString() != null) { processDistrict(page); + } else { + processProvince(page); } } @@ -45,28 +46,26 @@ public class ZipCodePageProcessor implements PageProcessor { private void processProvince(Page page) { //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 - List districts = page.getHtml().xpath("//body/table/tbody/tr/td").regex(".*http://www\\.ip138\\.com/post/\\w+/\\w+.*").all(); + List districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all(); + Pattern pattern = Pattern.compile("([^<>]+).*?href=\"(.*?)\"",Pattern.DOTALL); for (String district : districts) { - String link = xpath("//@href").select(district); - String title = xpath("/text()").select(district); - Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); - page.addTargetRequest(request); + Matcher matcher = pattern.matcher(district); + while (matcher.find()) { + String title = matcher.group(1); + String link = matcher.group(2); + Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); + page.addTargetRequest(request); + } } } private void processDistrict(Page page) { String province = page.getRequest().getExtra("province").toString(); String district = page.getRequest().getExtra("district").toString(); - List counties = page.getHtml().xpath("//body/table/tbody/tr").regex(".*\\d+.*").all(); - String regex = "]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)]*>([^<>]+)"; - for (String county : counties) { - String county0 = regex(regex, 1).select(county); - String county1 = regex(regex, 2).select(county); - String zipCode = regex(regex, 3).select(county); - page.putField("result", StringUtils.join(new String[]{province, district, - county0, county1, zipCode}, "\t")); - } - List links = page.getHtml().links().regex("http://www\\.ip138\\.com/post/\\w+/\\w+").all(); + String zipCode = page.getHtml().regex("

邮编:(\\d+)

").toString(); + page.putField("result", StringUtils.join(new String[]{province, district, + zipCode}, "\t")); + List links = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all(); for (String link : links) { page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district)); } @@ -79,11 +78,8 @@ public class ZipCodePageProcessor implements PageProcessor { } public static void main(String[] args) { - Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).run(); + Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).addUrl("http://www.ip138.com/post/"); - PriorityScheduler scheduler = new PriorityScheduler(); - Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(scheduler); - scheduler.push(new Request("http://www.baidu.com/s?wd=webmagic&f=12&rsp=0&oq=webmagix&tn=baiduhome_pg&ie=utf-8"),spider); spider.run(); } }