feat():爬取新闻中的图片文件

eg:http://www.nanshan.edu.cn/info/1051/8981.htm
master
Saisai 2024-08-16 14:59:14 +08:00
parent 39db1a4db8
commit 4b631bcd1e
6 changed files with 116 additions and 42 deletions

View File

@ -2,5 +2,13 @@
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="AliAccessStaticViaInstance" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliArrayNamingShouldHaveBracket" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliControlFlowStatementWithoutBraces" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliDeprecation" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliEqualsAvoidNull" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliLongLiteralsEndingWithLowercaseL" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliMissingOverrideAnnotation" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="AliWrapperTypeEquality" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="MapOrSetKeyShouldOverrideHashCodeEquals" enabled="true" level="WARNING" enabled_by_default="true" />
</profile>
</component>

View File

@ -39,6 +39,12 @@
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.14.0</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>

View File

@ -1,11 +1,14 @@
package com.mobai.webMagic;
import com.mobai.webMagic.util.ImageDownloaderUtil;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
@ -29,44 +32,56 @@ public class OschinaBlogPageProcessor implements PageProcessor {
}
log.info(url1.get());
if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
// 获取烟台南山学院的新闻Url
List<String> newsUrls = page.getHtml()
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// 存在分页,将下一页url 添加到待采集列表
Selectable nextPage = null;
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
if ("下页".equals(firstA.links().get())) {
nextPage = firstA;
} else if ("下页".equals(secondA.links().get())) {
nextPage = secondA;
} else {
System.out.println("没有下一页");
}
if (nextPage != null) {
newsUrls.add(nextPage.links().get());
Selectable finalNextPage = nextPage;
process(new Page() {{
setUrl(finalNextPage);
}});
}
// // 获取烟台南山学院的新闻Url
// List<String> newsUrls = page.getHtml()
// .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// // 存在分页,将下一页url 添加到待采集列表
// Selectable nextPage = null;
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
// Selectable secondA = null;
// if ("首页".equals(firstA)){
// secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]/text()");
// }
// if ("下页".equals(firstA)) {
// nextPage = firstA;
// } else if ("下页".equals(secondA)) {
// nextPage = secondA;
// } else {
// System.out.println("没有下一页");
// }
// if (nextPage != null) {
// newsUrls.add(nextPage.links().get());
// Selectable finalNextPage = nextPage;
// process(new Page() {{
// setUrl(finalNextPage);
// }});
// }
// 添加
page.addTargetRequests(newsUrls);
page.addTargetRequests(getNewsUrls(page));
page.setSkip(true);
} else {
String url = page.getUrl().toString();
String title = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/h1/text()").get();
// 日期等信息需分割 包括 来源
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
String[] split = newsHead.split("");
String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10);
String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
String date = newsHead.substring(newsHead.indexOf("发布时间")+5, newsHead.indexOf("发布时间") + 15);
String source = newsHead.substring(newsHead.indexOf("来源")+3, newsHead.indexOf("点击次数")-2);
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
page.putField("url", url);
page.putField("title", title);
page.putField("date", date);
page.putField("source", source);
page.putField("content", content);
//获取图片对应的URL
List<String> urlList = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/div/p/img/@src").all();
// 下载每一张图片
for (int i = 0; i < urlList.size(); i++) {
// http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
String inputUrl = "http://www.nanshan.edu.cn" + urlList.get(i);
ImageDownloaderUtil.downLoadImage(inputUrl, "D:/workspace/web-magic-test/news/img/" + inputUrl.split("/")[7]);
}
}
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
@ -79,22 +94,30 @@ public class OschinaBlogPageProcessor implements PageProcessor {
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// 存在分页,将下一页url 添加到待采集列表
Selectable nextPage = null;
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
if ("下页".equals(firstA.links().get())) {
nextPage = firstA;
} else if ("下页".equals(secondA.links().get())) {
nextPage = secondA;
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]");
Selectable secondA = null;
if (firstA.get().contains("首页")) {
try {
secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]");
} catch (Exception e) {
log.error("没有下一页了");
}
}
if (firstA.get().contains("下页")) {
nextPage = firstA.links();
} else if (ObjectUtils.isNotEmpty(secondA) && secondA.get().contains("下页")) {
nextPage = secondA.links();
} else {
System.out.println("没有下一页");
}
if (nextPage != null) {
newsUrls.add(nextPage.links().get());
Selectable finalNextPage = nextPage;
process(new Page() {{
setUrl(finalNextPage);
}});
// Selectable finalNextPage = nextPage;
// newsUrls.addAll(getNewsUrls(new Page() {{
// setUrl(finalNextPage);
// setHtml(new Html("",finalNextPage.get()));
// }}));
newsUrls.add(nextPage.get());
}
return newsUrls;
}
@ -102,12 +125,11 @@ public class OschinaBlogPageProcessor implements PageProcessor {
/**
 * Returns the crawl configuration used by this processor.
 *
 * @return the {@code site} field of this instance
 */
@Override
public Site getSite() {
return site;
}
/**
 * Command-line entry point: creates a spider around a new
 * {@code OschinaBlogPageProcessor}, registers the start URL(s), attaches a
 * {@code JsonFilePipeline} constructed with {@code "news/"}, and runs the spider.
 *
 * @param args command-line arguments (not read by this method)
 */
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcessor())
// NOTE(review): this hunk comes from a diff view without +/- markers; the two
// addUrl lines below are probably the removed and the added variant of a single
// line rather than two intended seeds - confirm against the real file.
.addUrl("http://www.nanshan.edu.cn/nyyw.htm")
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
.addPipeline(new JsonFilePipeline("news/"))
.run();
}

View File

@ -0,0 +1,39 @@
package com.mobai.webMagic.util;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
public class ImageProcessor implements PageProcessor {
private Site site = Site.me().setTimeOut(10000);
public void process(Page page) {
//获取图片对应的URL
List<String> urlList = page.getHtml().$("div[id=content]").xpath("//div/a/img/@src").all();
// 下载每一张图片
for (int i = 0; i < urlList.size(); i++) {
// http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
String inputUrl = "http://www.nanshan.edu.cn" + urlList.get(i);
ImageDownloaderUtil.downLoadImage(inputUrl, "image/" + inputUrl.split("/")[5]);
}
//获取下一页的url
String next = page.getHtml().xpath("//*[@id=\"content\"]/ul[2]/li[@class='nextPage']").links().get();
page.addTargetRequest(next);
}
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new ImageProcessor())
.addUrl("https://www.socwall.com")
.thread(5)
.run();
}
}