feat():获取页面内部链接获取新闻
parent
80efdc800a
commit
39db1a4db8
|
@ -0,0 +1,6 @@
|
||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<profile version="1.0">
|
||||||
|
<option name="myName" value="Project Default" />
|
||||||
|
<inspection_tool class="AliAccessStaticViaInstance" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||||
|
</profile>
|
||||||
|
</component>
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1 @@
|
||||||
|
{}
|
|
@ -20,25 +20,37 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(Page page) {
|
public void process(Page page) {
|
||||||
// http://www.nanshan.edu.cn/info/1051/8951.htm
|
|
||||||
// http://www.nanshan.edu.cn/nyyw.htm
|
|
||||||
// http://www.nanshan.edu.cn/nyyw/123.htm
|
|
||||||
Selectable url1 = page.getUrl();
|
Selectable url1 = page.getUrl();
|
||||||
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
|
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
|
||||||
page.getRequest().getUrl();
|
page.getRequest().getUrl();
|
||||||
// if (!url1.match()) {
|
|
||||||
if (url1 == null) {
|
if (url1 == null) {
|
||||||
log.error("url为空");
|
log.error("url为空");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
log.info(url1.get());
|
log.info(url1.get());
|
||||||
if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) {
|
if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
|
||||||
// 获取烟台南山学院的新闻Url
|
// 获取烟台南山学院的新闻Url
|
||||||
List<String> newsUrls = page.getHtml()
|
List<String> newsUrls = page.getHtml()
|
||||||
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
||||||
// 存在分页,将下一页url 添加到待采集列表
|
// 存在分页,将下一页url 添加到待采集列表
|
||||||
String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get();
|
Selectable nextPage = null;
|
||||||
newsUrls.add(nextPage);
|
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
||||||
|
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
||||||
|
|
||||||
|
if ("下页".equals(firstA.links().get())) {
|
||||||
|
nextPage = firstA;
|
||||||
|
} else if ("下页".equals(secondA.links().get())) {
|
||||||
|
nextPage = secondA;
|
||||||
|
} else {
|
||||||
|
System.out.println("没有下一页");
|
||||||
|
}
|
||||||
|
if (nextPage != null) {
|
||||||
|
newsUrls.add(nextPage.links().get());
|
||||||
|
Selectable finalNextPage = nextPage;
|
||||||
|
process(new Page() {{
|
||||||
|
setUrl(finalNextPage);
|
||||||
|
}});
|
||||||
|
}
|
||||||
// 添加
|
// 添加
|
||||||
page.addTargetRequests(newsUrls);
|
page.addTargetRequests(newsUrls);
|
||||||
} else {
|
} else {
|
||||||
|
@ -59,7 +71,32 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
||||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||||
page.addTargetRequests(links);
|
page.addTargetRequests(links);
|
||||||
System.out.println("页面提取执行完毕");
|
System.out.println("页面提取执行完毕");
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getNewsUrls(Page page) {
|
||||||
|
// 获取烟台南山学院的新闻Url
|
||||||
|
List<String> newsUrls = page.getHtml()
|
||||||
|
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
||||||
|
// 存在分页,将下一页url 添加到待采集列表
|
||||||
|
Selectable nextPage = null;
|
||||||
|
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
||||||
|
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
||||||
|
|
||||||
|
if ("下页".equals(firstA.links().get())) {
|
||||||
|
nextPage = firstA;
|
||||||
|
} else if ("下页".equals(secondA.links().get())) {
|
||||||
|
nextPage = secondA;
|
||||||
|
} else {
|
||||||
|
System.out.println("没有下一页");
|
||||||
|
}
|
||||||
|
if (nextPage != null) {
|
||||||
|
newsUrls.add(nextPage.links().get());
|
||||||
|
Selectable finalNextPage = nextPage;
|
||||||
|
process(new Page() {{
|
||||||
|
setUrl(finalNextPage);
|
||||||
|
}});
|
||||||
|
}
|
||||||
|
return newsUrls;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -70,7 +107,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
Spider.create(new OschinaBlogPageProcessor())
|
Spider.create(new OschinaBlogPageProcessor())
|
||||||
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
|
.addUrl("http://www.nanshan.edu.cn/nyyw.htm")
|
||||||
.addPipeline(new JsonFilePipeline("news/"))
|
.addPipeline(new JsonFilePipeline("news/"))
|
||||||
.run();
|
.run();
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ import java.io.OutputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 该方法使用HttpClient下载图片、PDF、压缩文件等
|
* 该方法使用HttpClient下载图片、PDF、压缩文件等
|
||||||
* @Author: m
|
* @Author: saisai
|
||||||
* @Date: 2023/7/25 11:44 下午
|
* @Date: 2023/7/25 11:44 下午
|
||||||
* @Version 1.0
|
* @Version 1.0
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Reference in New Issue