// ownWebMagic/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java
// (repository viewer metadata: 136 lines, 6.3 KiB, Java)

package com.mobai.webMagic;
import com.mobai.webMagic.util.ImageDownloaderUtil;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * WebMagic {@link PageProcessor} for the news section of www.nanshan.edu.cn.
 *
 * <p>Pages whose URL matches the news-list pattern are only used to discover article links
 * (plus the "next page" link of the pager) and are skipped by the pipelines. Every other page
 * is treated as a news article: its url, title, publish date, source and text content are stored
 * as result fields, and the images embedded in the article body are downloaded to a local
 * directory.
 *
 * <p>All XPath expressions are absolute and tied to the current layout of the site.
 */
public class OschinaBlogPageProcessor implements PageProcessor {

    private static final Logger log = Logger.getLogger(OschinaBlogPageProcessor.class);

    /** Scheme and host prepended to site-relative image paths. */
    private static final String SITE_ROOT = "http://www.nanshan.edu.cn";

    /** URLs matching this pattern are news-list pages (dots escaped so they match literally). */
    private static final String LIST_PAGE_REGEX = "http://www\\.nanshan\\.edu\\.cn/nyyw.*";

    /**
     * Link pattern applied to every processed page after extraction.
     * NOTE(review): this targets my.oschina.net and looks like a leftover from the WebMagic
     * sample processor; it is kept unchanged to preserve behavior - confirm it can be removed.
     */
    private static final String LEGACY_BLOG_LINK_REGEX = "http://my\\.oschina\\.net/flashsword/blog/\\d+";

    /** Local directory the article images are written to. */
    private static final String IMAGE_DIR = "D:/workspace/web-magic-test/news/img/";

    /** Common XPath prefix of the main content column. */
    private static final String CONTENT_ROOT = "/html/body/div[5]/div[1]/div[1]/div[2]";

    private static final String NEWS_LINKS_XPATH = CONTENT_ROOT + "/div/ul/li/a";
    private static final String PAGER_XPATH =
            CONTENT_ROOT + "/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div";
    private static final String PAGER_FIRST_ANCHOR_XPATH = PAGER_XPATH + "/a[1]";
    private static final String PAGER_THIRD_ANCHOR_XPATH = PAGER_XPATH + "/a[3]";

    private static final String ARTICLE_TITLE_XPATH = CONTENT_ROOT + "/form/div/h1/text()";
    private static final String ARTICLE_HEAD_XPATH = CONTENT_ROOT + "/form/div/div[1]/text()";
    private static final String ARTICLE_CONTENT_XPATH = CONTENT_ROOT + "/form/div/div[2]/allText()";
    private static final String ARTICLE_IMAGES_XPATH = CONTENT_ROOT + "/form/div/div[2]/div/p/img/@src";

    /** Labels found in the article header line and in the pager anchors. */
    private static final String DATE_MARKER = "发布时间";
    private static final String SOURCE_MARKER = "来源";
    private static final String CLICKS_MARKER = "点击次数";
    private static final String FIRST_PAGE_LABEL = "首页";
    private static final String NEXT_PAGE_LABEL = "下页";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Processes one downloaded page: list pages contribute follow-up requests and are skipped,
     * all other pages are extracted as news articles.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Selectable pageUrl = page.getUrl();
        String urlText = pageUrl == null ? null : pageUrl.get();
        if (urlText == null) {
            log.error("url为空");
            return;
        }
        log.info(urlText);

        if (urlText.matches(LIST_PAGE_REGEX)) {
            // List page: only queue the article links (and the next list page); nothing to store.
            page.addTargetRequests(getNewsUrls(page));
            page.setSkip(true);
        } else {
            extractArticle(page);
        }

        List<String> links = page.getHtml().links().regex(LEGACY_BLOG_LINK_REGEX).all();
        page.addTargetRequests(links);
        log.info("页面提取执行完毕");
    }

    /**
     * Collects the article links of a news-list page and, when the pager offers one, the link
     * to the next list page.
     *
     * @param page a news-list page
     * @return the article URLs, followed by the next-page URL if there is one
     */
    public List<String> getNewsUrls(Page page) {
        Html html = page.getHtml();
        List<String> newsUrls = html.xpath(NEWS_LINKS_XPATH).links().all();

        // The "next page" anchor is the first pager anchor on the first list page; on later
        // pages the first anchor is "first page" and "next page" is the third anchor.
        Selectable nextPage = null;
        Selectable firstAnchor = html.xpath(PAGER_FIRST_ANCHOR_XPATH);
        String firstAnchorHtml = firstAnchor.get();
        if (firstAnchorHtml != null) {
            if (firstAnchorHtml.contains(NEXT_PAGE_LABEL)) {
                nextPage = firstAnchor.links();
            } else if (firstAnchorHtml.contains(FIRST_PAGE_LABEL)) {
                Selectable thirdAnchor = html.xpath(PAGER_THIRD_ANCHOR_XPATH);
                String thirdAnchorHtml = thirdAnchor.get();
                if (thirdAnchorHtml != null && thirdAnchorHtml.contains(NEXT_PAGE_LABEL)) {
                    nextPage = thirdAnchor.links();
                }
            }
        }

        if (nextPage == null) {
            log.info("没有下一页");
        } else {
            String nextUrl = nextPage.get();
            if (nextUrl != null) {
                newsUrls.add(nextUrl);
            }
        }
        return newsUrls;
    }

    /**
     * Returns the crawl configuration (retry count and sleep time between requests).
     *
     * @return the site configuration used by the spider
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a spider seeded with a single news URL and writes the results as JSON files
     * under {@code news/}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new OschinaBlogPageProcessor())
                .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
                .addPipeline(new JsonFilePipeline("news/"))
                .run();
    }

    /**
     * Stores the article fields of {@code page} and downloads its images. A page without the
     * article header line is not an article in the expected layout and is skipped.
     */
    private void extractArticle(Page page) {
        Html html = page.getHtml();

        // The header line carries publish date, source and click count in one text node.
        String newsHead = html.xpath(ARTICLE_HEAD_XPATH).get();
        if (newsHead == null) {
            log.warn("Article header not found, skipping page: " + page.getUrl());
            page.setSkip(true);
            return;
        }

        page.putField("url", page.getUrl().toString());
        page.putField("title", html.xpath(ARTICLE_TITLE_XPATH).get());
        page.putField("date", extractDate(newsHead));
        page.putField("source", extractSource(newsHead));
        page.putField("content", html.xpath(ARTICLE_CONTENT_XPATH).get());

        for (String src : html.xpath(ARTICLE_IMAGES_XPATH).all()) {
            String imageUrl = toAbsoluteImageUrl(src);
            // The file name is the last path segment, whatever the directory depth.
            String fileName = imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
            if (fileName.isEmpty()) {
                log.warn("Image URL has no file name, skipping: " + imageUrl);
                continue;
            }
            ImageDownloaderUtil.downLoadImage(imageUrl, IMAGE_DIR + fileName);
        }
    }

    /**
     * Extracts the 10 characters that follow the date label and its one-character separator.
     *
     * @return the date text, or {@code null} when the label is missing or the line is too short
     */
    private static String extractDate(String newsHead) {
        int marker = newsHead.indexOf(DATE_MARKER);
        int start = marker + 5;
        int end = marker + 15;
        if (marker < 0 || end > newsHead.length()) {
            return null;
        }
        return newsHead.substring(start, end);
    }

    /**
     * Extracts the text between the source label (plus one separator character) and two
     * characters before the click-count label.
     *
     * @return the source text, or {@code null} when either label is missing or out of order
     */
    private static String extractSource(String newsHead) {
        int sourceMarker = newsHead.indexOf(SOURCE_MARKER);
        int clicksMarker = newsHead.indexOf(CLICKS_MARKER);
        if (sourceMarker < 0 || clicksMarker < 0) {
            return null;
        }
        int start = sourceMarker + 3;
        int end = clicksMarker - 2;
        if (start > end) {
            return null;
        }
        return newsHead.substring(start, end);
    }

    /** Returns {@code src} unchanged when it is already absolute, otherwise prefixes the site root. */
    private static String toAbsoluteImageUrl(String src) {
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        return SITE_ROOT + src;
    }
}