From 39db1a4db8070441b8bd52dd47c7361fc22573ef Mon Sep 17 00:00:00 2001 From: Saisai Date: Mon, 12 Aug 2024 13:58:50 +0800 Subject: [PATCH] =?UTF-8?q?feat():=E8=8E=B7=E5=8F=96=E9=A1=B5=E9=9D=A2?= =?UTF-8?q?=E5=86=85=E9=83=A8=E9=93=BE=E6=8E=A5=E8=8E=B7=E5=8F=96=E6=96=B0?= =?UTF-8?q?=E9=97=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/inspectionProfiles/Project_Default.xml | 6 ++ .idea/vcs.xml | 6 ++ .../a309af708c1947137a611e9580ec9248.json | 1 + .../webMagic/OschinaBlogPageProcessor.java | 55 ++++++++++++++++--- .../webMagic/util/ImageDownloaderUtil.java | 2 +- 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/vcs.xml create mode 100644 news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..c32584c --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json b/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/news/www.nanshan.edu.cn/a309af708c1947137a611e9580ec9248.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java index 67d7769..3c6a2d0 100644 --- a/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java +++ b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java @@ -20,25 +20,37 @@ public class OschinaBlogPageProcessor implements PageProcessor { @Override public void process(Page page) { - // http://www.nanshan.edu.cn/info/1051/8951.htm - // http://www.nanshan.edu.cn/nyyw.htm - // http://www.nanshan.edu.cn/nyyw/123.htm Selectable url1 = page.getUrl(); // Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*"); page.getRequest().getUrl(); -// if (!url1.match()) { if (url1 == null) { log.error("url为空"); return; } log.info(url1.get()); - if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) { + if (url1.get().matches("http://www.nanshan.edu.cn/nyyw.*")) { // 获取烟台南山学院的新闻Url List newsUrls = page.getHtml() .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all(); // 存在分页,将下一页url 添加到待采集列表 - String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get(); - newsUrls.add(nextPage); + Selectable nextPage = null; + Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()"); + Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()"); + + if ("下页".equals(firstA.links().get())) { + nextPage = firstA; + } else if ("下页".equals(secondA.links().get())) { + nextPage = secondA; + } else { + System.out.println("没有下一页"); + } + if (nextPage != null) { + newsUrls.add(nextPage.links().get()); + Selectable finalNextPage = nextPage; + process(new Page() {{ + setUrl(finalNextPage); + }}); + } // 添加 page.addTargetRequests(newsUrls); } else { @@ -48,7 +60,7 @@ public class OschinaBlogPageProcessor implements PageProcessor { String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get(); String[] split = newsHead.split(":"); String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10); - String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7); + String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7); String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get(); page.putField("url", url); page.putField("title", title); @@ -59,7 +71,32 @@ public class OschinaBlogPageProcessor implements PageProcessor { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); System.out.println("页面提取执行完毕"); + } + public List getNewsUrls(Page page) { + // 获取烟台南山学院的新闻Url + List newsUrls = page.getHtml() + .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all(); + // 存在分页,将下一页url 添加到待采集列表 + Selectable nextPage = null; + Selectable firstA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[1]/text()"); + Selectable secondA = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[3]/div/a[3]/test()"); + + if ("下页".equals(firstA.links().get())) { + nextPage = firstA; + } else if ("下页".equals(secondA.links().get())) { + nextPage = secondA; + } else { + System.out.println("没有下一页"); + } + if (nextPage != null) { + newsUrls.add(nextPage.links().get()); + Selectable finalNextPage = nextPage; + process(new Page() {{ + setUrl(finalNextPage); + }}); + } + return newsUrls; } @Override @@ -70,7 +107,7 @@ public class OschinaBlogPageProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new OschinaBlogPageProcessor()) - .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm") + .addUrl("http://www.nanshan.edu.cn/nyyw.htm") .addPipeline(new JsonFilePipeline("news/")) .run(); } diff --git a/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java index 9b388bb..83f5bbb 100644 --- a/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java +++ b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java @@ -14,7 +14,7 @@ import java.io.OutputStream; /** * 该方法使用HttpClient下载图片、PDF、压缩文件等 - * @Author: m + * @Author: saisai * @Date: 2023/7/25 11:44 下午 * @Version 1.0 */