|
|
@ -20,25 +20,37 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void process(Page page) {
|
|
|
|
public void process(Page page) {
|
|
|
|
// http://www.nanshan.edu.cn/info/1051/8951.htm
|
|
|
|
|
|
|
|
// http://www.nanshan.edu.cn/nyyw.htm
|
|
|
|
|
|
|
|
// http://www.nanshan.edu.cn/nyyw/123.htm
|
|
|
|
|
|
|
|
Selectable url1 = page.getUrl();
|
|
|
|
Selectable url1 = page.getUrl();
|
|
|
|
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
|
|
|
|
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
|
|
|
|
page.getRequest().getUrl();
|
|
|
|
page.getRequest().getUrl();
|
|
|
|
// if (!url1.match()) {
|
|
|
|
|
|
|
|
if (url1 == null) {
|
|
|
|
if (url1 == null) {
|
|
|
|
log.error("url为空");
|
|
|
|
log.error("url为空");
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
log.info(url1.get());
|
|
|
|
log.info(url1.get());
|
|
|
|
if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) {
|
|
|
|
if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
|
|
|
|
// 获取烟台南山学院的新闻Url
|
|
|
|
// 获取烟台南山学院的新闻Url
|
|
|
|
List<String> newsUrls = page.getHtml()
|
|
|
|
List<String> newsUrls = page.getHtml()
|
|
|
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
|
|
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
|
|
|
// 存在分页,将下一页url 添加到待采集列表
|
|
|
|
// 存在分页,将下一页url 添加到待采集列表
|
|
|
|
String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get();
|
|
|
|
Selectable nextPage = null;
|
|
|
|
newsUrls.add(nextPage);
|
|
|
|
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
|
|
|
|
|
|
|
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ("下页".equals(firstA.links().get())) {
|
|
|
|
|
|
|
|
nextPage = firstA;
|
|
|
|
|
|
|
|
} else if ("下页".equals(secondA.links().get())) {
|
|
|
|
|
|
|
|
nextPage = secondA;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
System.out.println("没有下一页");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nextPage != null) {
|
|
|
|
|
|
|
|
newsUrls.add(nextPage.links().get());
|
|
|
|
|
|
|
|
Selectable finalNextPage = nextPage;
|
|
|
|
|
|
|
|
process(new Page() {{
|
|
|
|
|
|
|
|
setUrl(finalNextPage);
|
|
|
|
|
|
|
|
}});
|
|
|
|
|
|
|
|
}
|
|
|
|
// 添加
|
|
|
|
// 添加
|
|
|
|
page.addTargetRequests(newsUrls);
|
|
|
|
page.addTargetRequests(newsUrls);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
@ -48,7 +60,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
|
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
|
|
|
|
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
|
|
|
|
String[] split = newsHead.split(":");
|
|
|
|
String[] split = newsHead.split(":");
|
|
|
|
String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10);
|
|
|
|
String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10);
|
|
|
|
String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
|
|
|
|
String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
|
|
|
|
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
|
|
|
|
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
|
|
|
|
page.putField("url", url);
|
|
|
|
page.putField("url", url);
|
|
|
|
page.putField("title", title);
|
|
|
|
page.putField("title", title);
|
|
|
@ -59,7 +71,32 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
|
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
|
|
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
|
|
|
page.addTargetRequests(links);
|
|
|
|
page.addTargetRequests(links);
|
|
|
|
System.out.println("页面提取执行完毕");
|
|
|
|
System.out.println("页面提取执行完毕");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public List<String> getNewsUrls(Page page) {
|
|
|
|
|
|
|
|
// 获取烟台南山学院的新闻Url
|
|
|
|
|
|
|
|
List<String> newsUrls = page.getHtml()
|
|
|
|
|
|
|
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
|
|
|
|
|
|
|
// 存在分页,将下一页url 添加到待采集列表
|
|
|
|
|
|
|
|
Selectable nextPage = null;
|
|
|
|
|
|
|
|
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
|
|
|
|
|
|
|
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ("下页".equals(firstA.links().get())) {
|
|
|
|
|
|
|
|
nextPage = firstA;
|
|
|
|
|
|
|
|
} else if ("下页".equals(secondA.links().get())) {
|
|
|
|
|
|
|
|
nextPage = secondA;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
System.out.println("没有下一页");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nextPage != null) {
|
|
|
|
|
|
|
|
newsUrls.add(nextPage.links().get());
|
|
|
|
|
|
|
|
Selectable finalNextPage = nextPage;
|
|
|
|
|
|
|
|
process(new Page() {{
|
|
|
|
|
|
|
|
setUrl(finalNextPage);
|
|
|
|
|
|
|
|
}});
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return newsUrls;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
@ -70,7 +107,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
public static void main(String[] args) {
|
|
|
|
Spider.create(new OschinaBlogPageProcessor())
|
|
|
|
Spider.create(new OschinaBlogPageProcessor())
|
|
|
|
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
|
|
|
|
.addUrl("http://www.nanshan.edu.cn/nyyw.htm")
|
|
|
|
.addPipeline(new JsonFilePipeline("news/"))
|
|
|
|
.addPipeline(new JsonFilePipeline("news/"))
|
|
|
|
.run();
|
|
|
|
.run();
|
|
|
|
}
|
|
|
|
}
|
|
|
|