// File-viewer metadata (not part of the program): 136 lines, 6.3 KiB, Java
package com.mobai.webMagic;
|
|
|
|
import com.mobai.webMagic.util.ImageDownloaderUtil;
|
|
import org.apache.commons.lang3.ObjectUtils;
|
|
import org.apache.log4j.Logger;
|
|
import us.codecraft.webmagic.Page;
|
|
import us.codecraft.webmagic.Site;
|
|
import us.codecraft.webmagic.Spider;
|
|
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
|
import us.codecraft.webmagic.processor.PageProcessor;
|
|
import us.codecraft.webmagic.selector.Html;
|
|
import us.codecraft.webmagic.selector.Selectable;
|
|
|
|
import java.util.List;
|
|
|
|
public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
|
private static final Logger log = Logger.getLogger(OschinaBlogPageProcessor.class);
|
|
// private Site site = Site.me().setDomain("my.oschina.net");
|
|
private Site site = Site.me().setRetryTimes(3)
|
|
// .setSleepTime(24*60*60*1000);
|
|
.setSleepTime(1000);
|
|
|
|
@Override
|
|
public void process(Page page) {
|
|
Selectable url1 = page.getUrl();
|
|
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
|
|
page.getRequest().getUrl();
|
|
if (url1 == null) {
|
|
log.error("url为空");
|
|
return;
|
|
}
|
|
log.info(url1.get());
|
|
if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
|
|
// // 获取烟台南山学院的新闻Url
|
|
// List<String> newsUrls = page.getHtml()
|
|
// .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
|
// // 存在分页,将下一页url 添加到待采集列表
|
|
// Selectable nextPage = null;
|
|
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
|
|
// Selectable secondA = null;
|
|
// if ("首页".equals(firstA)){
|
|
// secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]/text()");
|
|
// }
|
|
// if ("下页".equals(firstA)) {
|
|
// nextPage = firstA;
|
|
// } else if ("下页".equals(secondA)) {
|
|
// nextPage = secondA;
|
|
// } else {
|
|
// System.out.println("没有下一页");
|
|
// }
|
|
// if (nextPage != null) {
|
|
// newsUrls.add(nextPage.links().get());
|
|
// Selectable finalNextPage = nextPage;
|
|
// process(new Page() {{
|
|
// setUrl(finalNextPage);
|
|
// }});
|
|
// }
|
|
// 添加
|
|
page.addTargetRequests(getNewsUrls(page));
|
|
page.setSkip(true);
|
|
} else {
|
|
String url = page.getUrl().toString();
|
|
String title = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/h1/text()").get();
|
|
// 日期等信息需分割 包括 来源
|
|
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
|
|
String date = newsHead.substring(newsHead.indexOf("发布时间")+5, newsHead.indexOf("发布时间") + 15);
|
|
String source = newsHead.substring(newsHead.indexOf("来源")+3, newsHead.indexOf("点击次数")-2);
|
|
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
|
|
page.putField("url", url);
|
|
page.putField("title", title);
|
|
page.putField("date", date);
|
|
page.putField("source", source);
|
|
page.putField("content", content);
|
|
|
|
|
|
//获取图片对应的URL
|
|
List<String> urlList = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/div/p/img/@src").all();
|
|
// 下载每一张图片
|
|
for (int i = 0; i < urlList.size(); i++) {
|
|
// http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
|
|
String inputUrl = "http://www.nanshan.edu.cn" + urlList.get(i);
|
|
ImageDownloaderUtil.downLoadImage(inputUrl, "D:/workspace/web-magic-test/news/img/" + inputUrl.split("/")[7]);
|
|
}
|
|
}
|
|
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
page.addTargetRequests(links);
|
|
System.out.println("页面提取执行完毕");
|
|
}
|
|
|
|
public List<String> getNewsUrls(Page page) {
|
|
// 获取烟台南山学院的新闻Url
|
|
List<String> newsUrls = page.getHtml()
|
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
|
// 存在分页,将下一页url 添加到待采集列表
|
|
Selectable nextPage = null;
|
|
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
|
|
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]");
|
|
Selectable secondA = null;
|
|
if (firstA.get().contains("首页")) {
|
|
try {
|
|
secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]");
|
|
} catch (Exception e) {
|
|
log.error("没有下一页了");
|
|
}
|
|
}
|
|
if (firstA.get().contains("下页")) {
|
|
nextPage = firstA.links();
|
|
} else if (ObjectUtils.isNotEmpty(secondA) && secondA.get().contains("下页")) {
|
|
nextPage = secondA.links();
|
|
} else {
|
|
System.out.println("没有下一页");
|
|
}
|
|
if (nextPage != null) {
|
|
// Selectable finalNextPage = nextPage;
|
|
// newsUrls.addAll(getNewsUrls(new Page() {{
|
|
// setUrl(finalNextPage);
|
|
// setHtml(new Html("",finalNextPage.get()));
|
|
// }}));
|
|
newsUrls.add(nextPage.get());
|
|
}
|
|
return newsUrls;
|
|
}
|
|
|
|
@Override
|
|
public Site getSite() {
|
|
return site;
|
|
}
|
|
|
|
public static void main(String[] args) {
|
|
Spider.create(new OschinaBlogPageProcessor())
|
|
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
|
|
.addPipeline(new JsonFilePipeline("news/"))
|
|
.run();
|
|
}
|
|
} |