feat():获取页面内部链接获取新闻

master
Saisai 2024-08-12 13:58:50 +08:00
parent 80efdc800a
commit 39db1a4db8
5 changed files with 60 additions and 10 deletions

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="AliAccessStaticViaInstance" enabled="true" level="WARNING" enabled_by_default="true" />
</profile>
</component>

6
.idea/vcs.xml 100644
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -20,25 +20,37 @@ public class OschinaBlogPageProcessor implements PageProcessor {
@Override @Override
public void process(Page page) { public void process(Page page) {
// http://www.nanshan.edu.cn/info/1051/8951.htm
// http://www.nanshan.edu.cn/nyyw.htm
// http://www.nanshan.edu.cn/nyyw/123.htm
Selectable url1 = page.getUrl(); Selectable url1 = page.getUrl();
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*"); // Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
page.getRequest().getUrl(); page.getRequest().getUrl();
// if (!url1.match()) {
if (url1 == null) { if (url1 == null) {
log.error("url为空"); log.error("url为空");
return; return;
} }
log.info(url1.get()); log.info(url1.get());
if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) { if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
// 获取烟台南山学院的新闻Url // 获取烟台南山学院的新闻Url
List<String> newsUrls = page.getHtml() List<String> newsUrls = page.getHtml()
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all(); .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// 存在分页,将下一页url 添加到待采集列表 // 存在分页,将下一页url 添加到待采集列表
String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get(); Selectable nextPage = null;
newsUrls.add(nextPage); Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
if ("下页".equals(firstA.links().get())) {
nextPage = firstA;
} else if ("下页".equals(secondA.links().get())) {
nextPage = secondA;
} else {
System.out.println("没有下一页");
}
if (nextPage != null) {
newsUrls.add(nextPage.links().get());
Selectable finalNextPage = nextPage;
process(new Page() {{
setUrl(finalNextPage);
}});
}
// 添加 // 添加
page.addTargetRequests(newsUrls); page.addTargetRequests(newsUrls);
} else { } else {
@ -59,7 +71,32 @@ public class OschinaBlogPageProcessor implements PageProcessor {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links); page.addTargetRequests(links);
System.out.println("页面提取执行完毕"); System.out.println("页面提取执行完毕");
}
public List<String> getNewsUrls(Page page) {
// 获取烟台南山学院的新闻Url
List<String> newsUrls = page.getHtml()
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// 存在分页,将下一页url 添加到待采集列表
Selectable nextPage = null;
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
if ("下页".equals(firstA.links().get())) {
nextPage = firstA;
} else if ("下页".equals(secondA.links().get())) {
nextPage = secondA;
} else {
System.out.println("没有下一页");
}
if (nextPage != null) {
newsUrls.add(nextPage.links().get());
Selectable finalNextPage = nextPage;
process(new Page() {{
setUrl(finalNextPage);
}});
}
return newsUrls;
} }
@Override @Override
@ -70,7 +107,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
/**
 * Entry point: creates a Spider around a new OschinaBlogPageProcessor, registers the start URL
 * and a JsonFilePipeline constructed with "news/", then runs the spider.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) { public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcessor()) Spider.create(new OschinaBlogPageProcessor())
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm") .addUrl("http://www.nanshan.edu.cn/nyyw.htm")
.addPipeline(new JsonFilePipeline("news/")) .addPipeline(new JsonFilePipeline("news/"))
.run(); .run();
} }

View File

@ -14,7 +14,7 @@ import java.io.OutputStream;
/** /**
* 使HttpClientPDF * 使HttpClientPDF
* @Author: m * @Author: saisai
* @Date: 2023/7/25 11:44 * @Date: 2023/7/25 11:44
* @Version 1.0 * @Version 1.0
*/ */