add huaban processor
parent
fe224cbf66
commit
42508af041
|
@ -15,6 +15,9 @@ public class ConsolePipeline implements Pipeline{
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(ResultItems resultItems,Task task) {
|
public void process(ResultItems resultItems,Task task) {
|
||||||
|
if (resultItems.isSkip()){
|
||||||
|
return;
|
||||||
|
}
|
||||||
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
System.out.println("get page: "+resultItems.getRequest().getUrl());
|
||||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
System.out.println(entry.getKey()+":\t"+entry.getValue());
|
||||||
|
|
|
@ -45,6 +45,9 @@ public class FilePipeline implements Pipeline {
|
||||||
if (!file.exists()) {
|
if (!file.exists()) {
|
||||||
file.mkdirs();
|
file.mkdirs();
|
||||||
}
|
}
|
||||||
|
if (resultItems.isSkip()){
|
||||||
|
return;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html"));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html"));
|
||||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
|
|
|
@ -53,6 +53,7 @@ public class SeleniumDownloader implements Downloader,Destroyable {
|
||||||
logger.warn("interrupted", e);
|
logger.warn("interrupted", e);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
logger.info("downloading page " + request.getUrl());
|
||||||
webDriver.get(request.getUrl());
|
webDriver.get(request.getUrl());
|
||||||
WebDriver.Options manage = webDriver.manage();
|
WebDriver.Options manage = webDriver.manage();
|
||||||
Site site = task.getSite();
|
Site site = task.getSite();
|
||||||
|
|
|
@ -7,8 +7,6 @@ import org.openqa.selenium.WebDriver;
|
||||||
import org.openqa.selenium.WebElement;
|
import org.openqa.selenium.WebElement;
|
||||||
import org.openqa.selenium.chrome.ChromeDriver;
|
import org.openqa.selenium.chrome.ChromeDriver;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author yihua.huang@dianping.com <br>
|
* @author yihua.huang@dianping.com <br>
|
||||||
* @date: 13-7-26 <br>
|
* @date: 13-7-26 <br>
|
||||||
|
@ -18,14 +16,12 @@ public class SeleniumTest {
|
||||||
|
|
||||||
@Ignore("need chrome driver")
|
@Ignore("need chrome driver")
|
||||||
@Test
|
@Test
|
||||||
public void test(){
|
public void testSelenium() {
|
||||||
System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver");
|
System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver");
|
||||||
WebDriver webDriver = new ChromeDriver();
|
WebDriver webDriver = new ChromeDriver();
|
||||||
webDriver.get("http://huaban.com/");
|
webDriver.get("http://huaban.com/");
|
||||||
List<WebElement> elements = webDriver.findElements(By.xpath("/html"));
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
||||||
for (WebElement element : elements) {
|
System.out.println(webElement.getAttribute("outerHTML"));
|
||||||
System.out.println(element.getAttribute("outerHTML"));
|
|
||||||
}
|
|
||||||
webDriver.close();
|
webDriver.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,11 @@
|
||||||
<artifactId>webmagic-misc</artifactId>
|
<artifactId>webmagic-misc</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>us.codecraft</groupId>
|
||||||
|
<artifactId>webmagic-selenium</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>junit</groupId>
|
<groupId>junit</groupId>
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||||
|
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author yihua.huang@dianping.com <br>
|
||||||
|
* @date: 13-7-26 <br>
|
||||||
|
* Time: 下午4:08 <br>
|
||||||
|
*/
|
||||||
|
public class HuabanProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
|
||||||
|
if (page.getUrl().toString().contains("pins")) {
|
||||||
|
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString());
|
||||||
|
} else {
|
||||||
|
page.getResultItems().setSkip(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
if (site == null) {
|
||||||
|
site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/");
|
||||||
|
}
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new HuabanProcessor())
|
||||||
|
.scheduler(new RedisScheduler("localhost"))
|
||||||
|
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
||||||
|
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
|
||||||
|
.runAsync();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue