diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 72c3bf3..8be5fab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -15,6 +15,9 @@ public class ConsolePipeline implements Pipeline{ @Override public void process(ResultItems resultItems,Task task) { + if (resultItems.isSkip()){ + return; + } System.out.println("get page: "+resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { System.out.println(entry.getKey()+":\t"+entry.getValue()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 10d97a8..cbce832 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -45,6 +45,9 @@ public class FilePipeline implements Pipeline { if (!file.exists()) { file.mkdirs(); } + if (resultItems.isSkip()){ + return; + } try { PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 1ed8b4d..b4dd372 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -53,6 +53,7 @@ public class 
SeleniumDownloader implements Downloader,Destroyable { logger.warn("interrupted", e); return null; } + logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java index e9e1c7b..6cf50c3 100644 --- a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java +++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java @@ -7,8 +7,6 @@ import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; -import java.util.List; - /** * @author yihua.huang@dianping.com
* @date: 13-7-26
@@ -18,14 +16,12 @@ public class SeleniumTest { @Ignore("need chrome driver") @Test - public void test(){ - System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver"); + public void testSelenium() { + System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); WebDriver webDriver = new ChromeDriver(); webDriver.get("http://huaban.com/"); - List elements = webDriver.findElements(By.xpath("/html")); - for (WebElement element : elements) { - System.out.println(element.getAttribute("outerHTML")); - } + WebElement webElement = webDriver.findElement(By.xpath("/html")); + System.out.println(webElement.getAttribute("outerHTML")); webDriver.close(); } } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index b845490..8a7e00c 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -22,6 +22,11 @@ webmagic-misc ${project.version} + + us.codecraft + webmagic-selenium + ${project.version} + junit junit diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java new file mode 100644 index 0000000..23434f3 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuabanProcessor.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.scheduler.RedisScheduler; +import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-26
+ * Time: 4:08 PM
+ */ +public class HuabanProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); + if (page.getUrl().toString().contains("pins")) { + page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString()); + } else { + page.getResultItems().setSkip(true); + } + } + + @Override + public Site getSite() { + if (site == null) { + site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/"); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new HuabanProcessor()) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/webmagic/test/")) + .downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) + .runAsync(); + } +}