diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..c32584c
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json b/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java
index 67d7769..3c6a2d0 100644
--- a/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java
+++ b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java
@@ -20,25 +20,37 @@ public class OschinaBlogPageProcessor implements PageProcessor {
@Override
public void process(Page page) {
- // http://www.nanshan.edu.cn/info/1051/8951.htm
- // http://www.nanshan.edu.cn/nyyw.htm
- // http://www.nanshan.edu.cn/nyyw/123.htm
Selectable url1 = page.getUrl();
// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*");
page.getRequest().getUrl();
-// if (!url1.match()) {
if (url1 == null) {
log.error("url为空");
return;
}
log.info(url1.get());
- if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) {
+ if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) {
// 获取烟台南山学院的新闻Url
List newsUrls = page.getHtml()
.xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
// 存在分页,将下一页url 添加到待采集列表
- String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get();
- newsUrls.add(nextPage);
+ Selectable nextPage = null;
+ Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()");
+ Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()");
+
+ if ("下页".equals(firstA.links().get())) {
+ nextPage = firstA;
+ } else if ("下页".equals(secondA.links().get())) {
+ nextPage = secondA;
+ } else {
+ System.out.println("没有下一页");
+ }
+ if (nextPage != null) {
+ newsUrls.add(nextPage.links().get());
+ Selectable finalNextPage = nextPage;
+ process(new Page() {{
+ setUrl(finalNextPage);
+ }});
+ }
// 添加
page.addTargetRequests(newsUrls);
} else {
@@ -48,7 +60,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get();
String[] split = newsHead.split(":");
String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10);
- String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
+ String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7);
String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get();
page.putField("url", url);
page.putField("title", title);
@@ -59,7 +71,32 @@ public class OschinaBlogPageProcessor implements PageProcessor {
List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
System.out.println("页面提取执行完毕");
+ }
+    /**
+     * Collects the article links of a news-list page and, when the pager offers one, the link
+     * to the next list page.
+     *
+     * <p>The pager's "下页" (next page) anchor is looked for at {@code a[1]} and then
+     * {@code a[3]} of the pager cell; the first anchor whose text is exactly "下页" supplies
+     * the next-page link. The link is only returned: it is not fed back into
+     * {@link #process(Page)}, which reads the request and the HTML of the page it is given and
+     * therefore cannot work on a page that carries nothing but a URL.
+     *
+     * @param page the list page to read the links from
+     * @return the article URLs, followed by the next-page URL when one was found
+     */
+    public List getNewsUrls(Page page) {
+        // Article links of the Yantai Nanshan University news list.
+        List<String> newsUrls = page.getHtml()
+                .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all();
+
+        // Pager cell that holds the navigation anchors.
+        String pagerCell =
+                "/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/";
+        String nextPageUrl = null;
+        for (String anchor : new String[] {"a[1]", "a[3]"}) {
+            String anchorPath = pagerCell + anchor;
+            // Match on the anchor's visible text; its href is what gets collected.
+            if ("下页".equals(page.getHtml().xpath(anchorPath + "/text()").get())) {
+                nextPageUrl = page.getHtml().xpath(anchorPath).links().get();
+                break;
+            }
+        }
+
+        if (nextPageUrl == null) {
+            log.info("没有下一页");
+        } else {
+            // Paging exists: hand the next list page back together with the article links.
+            newsUrls.add(nextPageUrl);
+        }
+        return newsUrls;
     }
@Override
@@ -70,7 +107,7 @@ public class OschinaBlogPageProcessor implements PageProcessor {
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcessor())
- .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
+ .addUrl("http://www.nanshan.edu.cn/nyyw.htm")
.addPipeline(new JsonFilePipeline("news/"))
.run();
}
diff --git a/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java
index 9b388bb..83f5bbb 100644
--- a/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java
+++ b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java
@@ -14,7 +14,7 @@ import java.io.OutputStream;
/**
* 该方法使用HttpClient下载图片、PDF、压缩文件等
- * @Author: m
+ * @Author: saisai
* @Date: 2023/7/25 11:44 下午
* @Version 1.0
*/