parent
39db1a4db8
commit
4b631bcd1e
|
@ -2,5 +2,13 @@
|
|||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="AliAccessStaticViaInstance" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliArrayNamingShouldHaveBracket" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliControlFlowStatementWithoutBraces" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliDeprecation" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliEqualsAvoidNull" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliLongLiteralsEndingWithLowercaseL" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliMissingOverrideAnnotation" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="AliWrapperTypeEquality" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
<inspection_tool class="MapOrSetKeyShouldOverrideHashCodeEquals" enabled="true" level="WARNING" enabled_by_default="true" />
|
||||
</profile>
|
||||
</component>
|
|
@ -1 +0,0 @@
|
|||
{}
|
6
pom.xml
6
pom.xml
|
@ -39,6 +39,12 @@
|
|||
<version>1.2.17</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.14.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
package com.mobai.webMagic;
|
||||
|
||||
import com.mobai.webMagic.util.ImageDownloaderUtil;
|
||||
import org.apache.commons.lang3.ObjectUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -29,44 +32,56 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|||
}
|
||||
log.info(url1.get());
|
||||
if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
|
||||
// 获取烟台南山学院的新闻Url
|
||||
List<String> newsUrls = page.getHtml()
|
||||
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
||||
// 存在分页,将下一页url 添加到待采集列表
|
||||
Selectable nextPage = null;
|
||||
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
||||
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
||||
|
||||
if ("下页".equals(firstA.links().get())) {
|
||||
nextPage = firstA;
|
||||
} else if ("下页".equals(secondA.links().get())) {
|
||||
nextPage = secondA;
|
||||
} else {
|
||||
System.out.println("没有下一页");
|
||||
}
|
||||
if (nextPage != null) {
|
||||
newsUrls.add(nextPage.links().get());
|
||||
Selectable finalNextPage = nextPage;
|
||||
process(new Page() {{
|
||||
setUrl(finalNextPage);
|
||||
}});
|
||||
}
|
||||
// // 获取烟台南山学院的新闻Url
|
||||
// List<String> newsUrls = page.getHtml()
|
||||
// .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
||||
// // 存在分页,将下一页url 添加到待采集列表
|
||||
// Selectable nextPage = null;
|
||||
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
|
||||
// Selectable secondA = null;
|
||||
// if ("首页".equals(firstA)){
|
||||
// secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]/text()");
|
||||
// }
|
||||
// if ("下页".equals(firstA)) {
|
||||
// nextPage = firstA;
|
||||
// } else if ("下页".equals(secondA)) {
|
||||
// nextPage = secondA;
|
||||
// } else {
|
||||
// System.out.println("没有下一页");
|
||||
// }
|
||||
// if (nextPage != null) {
|
||||
// newsUrls.add(nextPage.links().get());
|
||||
// Selectable finalNextPage = nextPage;
|
||||
// process(new Page() {{
|
||||
// setUrl(finalNextPage);
|
||||
// }});
|
||||
// }
|
||||
// 添加
|
||||
page.addTargetRequests(newsUrls);
|
||||
page.addTargetRequests(getNewsUrls(page));
|
||||
page.setSkip(true);
|
||||
} else {
|
||||
String url = page.getUrl().toString();
|
||||
String title = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/h1/text()").get();
|
||||
// 日期等信息需分割 包括 来源
|
||||
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
|
||||
String[] split = newsHead.split(":");
|
||||
String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10);
|
||||
String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
|
||||
String date = newsHead.substring(newsHead.indexOf("发布时间")+5, newsHead.indexOf("发布时间") + 15);
|
||||
String source = newsHead.substring(newsHead.indexOf("来源")+3, newsHead.indexOf("点击次数")-2);
|
||||
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
|
||||
page.putField("url", url);
|
||||
page.putField("title", title);
|
||||
page.putField("date", date);
|
||||
page.putField("source", source);
|
||||
page.putField("content", content);
|
||||
|
||||
|
||||
//获取图片对应的URL
|
||||
List<String> urlList = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/div/p/img/@src").all();
|
||||
// 下载每一张图片
|
||||
for (int i = 0; i < urlList.size(); i++) {
|
||||
// http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
|
||||
String inputUrl = "http://www.nanshan.edu.cn" + urlList.get(i);
|
||||
ImageDownloaderUtil.downLoadImage(inputUrl, "D:/workspace/web-magic-test/news/img/" + inputUrl.split("/")[7]);
|
||||
}
|
||||
}
|
||||
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
|
||||
page.addTargetRequests(links);
|
||||
|
@ -79,22 +94,30 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|||
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
|
||||
// 存在分页,将下一页url 添加到待采集列表
|
||||
Selectable nextPage = null;
|
||||
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
|
||||
Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
|
||||
|
||||
if ("下页".equals(firstA.links().get())) {
|
||||
nextPage = firstA;
|
||||
} else if ("下页".equals(secondA.links().get())) {
|
||||
nextPage = secondA;
|
||||
// Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]/text()");
|
||||
Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[1]");
|
||||
Selectable secondA = null;
|
||||
if (firstA.get().contains("首页")) {
|
||||
try {
|
||||
secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td[2]/div/a[3]");
|
||||
} catch (Exception e) {
|
||||
log.error("没有下一页了");
|
||||
}
|
||||
}
|
||||
if (firstA.get().contains("下页")) {
|
||||
nextPage = firstA.links();
|
||||
} else if (ObjectUtils.isNotEmpty(secondA) && secondA.get().contains("下页")) {
|
||||
nextPage = secondA.links();
|
||||
} else {
|
||||
System.out.println("没有下一页");
|
||||
}
|
||||
if (nextPage != null) {
|
||||
newsUrls.add(nextPage.links().get());
|
||||
Selectable finalNextPage = nextPage;
|
||||
process(new Page() {{
|
||||
setUrl(finalNextPage);
|
||||
}});
|
||||
// Selectable finalNextPage = nextPage;
|
||||
// newsUrls.addAll(getNewsUrls(new Page() {{
|
||||
// setUrl(finalNextPage);
|
||||
// setHtml(new Html("",finalNextPage.get()));
|
||||
// }}));
|
||||
newsUrls.add(nextPage.get());
|
||||
}
|
||||
return newsUrls;
|
||||
}
|
||||
|
@ -102,12 +125,11 @@ public class OschinaBlogPageProcessor implements PageProcessor {
|
|||
/** Returns the {@code site} field held by this processor. */
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
|
||||
}
|
||||
|
||||
/**
 * Entry point: creates a {@code Spider} around a new {@code OschinaBlogPageProcessor},
 * registers the seed URLs, attaches a {@code JsonFilePipeline} constructed with
 * {@code "news/"}, and runs the spider.
 *
 * NOTE(review): two seed URLs are registered here; only the first matches the
 * {@code nyyw.*} list-page pattern checked in {@code process} -- confirm both are intended.
 */
public static void main(String[] args) {
|
||||
Spider.create(new OschinaBlogPageProcessor())
|
||||
.addUrl("http://www.nanshan.edu.cn/nyyw.htm")
|
||||
.addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
|
||||
.addPipeline(new JsonFilePipeline("news/"))
|
||||
.run();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
package com.mobai.webMagic.util;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class ImageProcessor implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setTimeOut(10000);
|
||||
|
||||
public void process(Page page) {
|
||||
//获取图片对应的URL
|
||||
List<String> urlList = page.getHtml().$("div[id=content]").xpath("//div/a/img/@src").all();
|
||||
// 下载每一张图片
|
||||
for (int i = 0; i < urlList.size(); i++) {
|
||||
// http://www.nanshan.edu.cn/__local/2/4D/93/76F03CB6EBE3BFB7018397B37D4_C03E3A0C_36A9C.png
|
||||
String inputUrl = "http://www.nanshan.edu.cn" + urlList.get(i);
|
||||
ImageDownloaderUtil.downLoadImage(inputUrl, "image/" + inputUrl.split("/")[5]);
|
||||
}
|
||||
//获取下一页的url
|
||||
String next = page.getHtml().xpath("//*[@id=\"content\"]/ul[2]/li[@class='nextPage']").links().get();
|
||||
page.addTargetRequest(next);
|
||||
}
|
||||
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new ImageProcessor())
|
||||
.addUrl("https://www.socwall.com")
|
||||
.thread(5)
|
||||
.run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue