package com.mobai.webMagic;

import com.mobai.webMagic.util.ImageDownloaderUtil;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * WebMagic page processor that crawls the news section of www.nanshan.edu.cn.
 *
 * <p>Pages whose URL matches {@link #LIST_PAGE_URL} are treated as news list pages: the article
 * links (and the "next page" link, when present) are queued and the page itself is skipped.
 * Every other page is treated as an article: url, title, date, source and content are stored as
 * result fields and the images referenced in the article body are downloaded.
 */
public class OschinaBlogPageProcessor implements PageProcessor {

    private static final Logger log = Logger.getLogger(OschinaBlogPageProcessor.class);

    /** Host prepended to image {@code src} values that are not already absolute. */
    private static final String SITE_ROOT = "http://www.nanshan.edu.cn";

    /** URLs matching this expression are handled as news list pages. */
    private static final Pattern LIST_PAGE_URL = Pattern.compile("http://www.nanshan.edu.cn/nyyw.*");

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "D:/workspace/web-magic-test/news/img/";

    /** Common XPath prefix of the main content column. */
    private static final String CONTENT_ROOT = "/html/body/div[5]/div[1]/div[1]/div[2]";

    private static final String ARTICLE_ROOT = CONTENT_ROOT + "/form/div";
    private static final String TITLE_XPATH = ARTICLE_ROOT + "/h1/text()";
    private static final String HEAD_XPATH = ARTICLE_ROOT + "/div[1]/text()";
    private static final String CONTENT_XPATH = ARTICLE_ROOT + "/div[2]/allText()";
    private static final String IMAGE_SRC_XPATH = ARTICLE_ROOT + "/div[2]/div/p/img/@src";

    private static final String NEWS_LINK_XPATH = CONTENT_ROOT + "/div/ul/li/a";
    private static final String PAGER_XPATH =
            CONTENT_ROOT + "/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div";

    private static final String DATE_MARKER = "发布时间";
    private static final String SOURCE_MARKER = "来源";
    private static final String CLICKS_MARKER = "点击次数";
    private static final String FIRST_PAGE_LABEL = "首页";
    private static final String NEXT_PAGE_LABEL = "下页";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Dispatches a downloaded page to list-page or article handling based on its URL.
     *
     * @param page the downloaded page; a page without a URL is logged and ignored
     */
    @Override
    public void process(Page page) {
        Selectable pageUrl = page.getUrl();
        if (pageUrl == null || pageUrl.get() == null) {
            log.error("url为空");
            return;
        }
        String url = pageUrl.get();
        log.info(url);

        if (LIST_PAGE_URL.matcher(url).matches()) {
            // List page: queue the article links and keep the page itself out of the pipelines.
            page.addTargetRequests(getNewsUrls(page));
            page.setSkip(true);
        } else {
            extractArticle(page);
        }

        // Also queue any oschina blog links found on the page (runs for both page kinds).
        List<String> links = page.getHtml().links()
                .regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
        page.addTargetRequests(links);
        System.out.println("页面提取执行完毕");
    }

    /**
     * Collects the article links of a news list page plus, when the pager offers one, the link
     * to the next list page.
     *
     * @param page a news list page
     * @return a mutable list of URLs to crawl next; never {@code null}
     */
    public List<String> getNewsUrls(Page page) {
        // Copy so that appending the next-page link never mutates the selector's own list.
        List<String> newsUrls = new ArrayList<>(
                page.getHtml().xpath(NEWS_LINK_XPATH).links().all());

        Selectable nextPage = findNextPageLink(page);
        if (nextPage == null) {
            System.out.println("没有下一页");
            return newsUrls;
        }

        String nextUrl = nextPage.get();
        if (nextUrl != null && !nextUrl.isEmpty()) {
            newsUrls.add(nextUrl);
        }
        return newsUrls;
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new OschinaBlogPageProcessor())
                .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
                .addPipeline(new JsonFilePipeline("news/"))
                .run();
    }

    /**
     * Locates the "next page" anchor in the pager.
     *
     * <p>The first pager anchor is used when it carries the next-page label. When the first
     * anchor is the first-page link instead, the third anchor is checked.
     *
     * @return the links of the next-page anchor, or {@code null} when there is no next page
     */
    private Selectable findNextPageLink(Page page) {
        Selectable firstAnchor = page.getHtml().xpath(PAGER_XPATH + "/a[1]");
        String firstHtml = firstAnchor.get();
        if (firstHtml == null) {
            // No pager on this page at all.
            return null;
        }
        if (firstHtml.contains(NEXT_PAGE_LABEL)) {
            return firstAnchor.links();
        }
        if (firstHtml.contains(FIRST_PAGE_LABEL)) {
            Selectable thirdAnchor = page.getHtml().xpath(PAGER_XPATH + "/a[3]");
            String thirdHtml = thirdAnchor.get();
            if (thirdHtml != null && thirdHtml.contains(NEXT_PAGE_LABEL)) {
                return thirdAnchor.links();
            }
        }
        return null;
    }

    /** Stores the article fields of {@code page} as result fields and downloads its images. */
    private void extractArticle(Page page) {
        Html html = page.getHtml();

        // The header line carries the publish date and the source, which are sliced out below.
        String newsHead = html.xpath(HEAD_XPATH).get();

        page.putField("url", page.getUrl().toString());
        page.putField("title", html.xpath(TITLE_XPATH).get());
        page.putField("date", extractDate(newsHead));
        page.putField("source", extractSource(newsHead));
        page.putField("content", html.xpath(CONTENT_XPATH).get());

        downloadImages(html.xpath(IMAGE_SRC_XPATH).all());
    }

    /**
     * Returns the 10 characters that follow the date marker and its one-character separator.
     *
     * @return the date text, or an empty string when the header or the marker is missing
     */
    private static String extractDate(String newsHead) {
        if (newsHead == null) {
            return "";
        }
        int marker = newsHead.indexOf(DATE_MARKER);
        if (marker < 0) {
            return "";
        }
        int from = marker + 5;
        int to = Math.min(marker + 15, newsHead.length());
        return from <= to ? newsHead.substring(from, to) : "";
    }

    /**
     * Returns the text between the source marker (plus its one-character separator) and the
     * two characters preceding the click-count marker.
     *
     * @return the source text, or an empty string when the header or either marker is missing
     */
    private static String extractSource(String newsHead) {
        if (newsHead == null) {
            return "";
        }
        int marker = newsHead.indexOf(SOURCE_MARKER);
        int clicks = newsHead.indexOf(CLICKS_MARKER);
        if (marker < 0 || clicks < 0) {
            return "";
        }
        int from = marker + 3;
        int to = clicks - 2;
        return from <= to ? newsHead.substring(from, to) : "";
    }

    /** Downloads every image in {@code sources} into {@link #IMAGE_DIR}. */
    private static void downloadImages(List<String> sources) {
        for (String src : sources) {
            if (src == null || src.isEmpty()) {
                continue;
            }
            // e.g. http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
            boolean absolute = src.startsWith("http://") || src.startsWith("https://");
            String imageUrl = absolute ? src : SITE_ROOT + src;

            // Use the last path segment as the file name instead of a fixed segment index,
            // so URLs with a different directory depth do not fail.
            String fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
            if (fileName.isEmpty()) {
                log.warn("skip image without file name: " + imageUrl);
                continue;
            }
            ImageDownloaderUtil.downLoadImage(imageUrl, IMAGE_DIR + fileName);
        }
    }
}