commit 80efdc800a7c557deddba57ad5faf79655a850b1 Author: Saisai Date: Mon Aug 12 11:13:18 2024 +0800 feat():测试成功爬虫框架 subject:获取烟台南山学院单个新闻信息并存如news文件夹 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ff6309 --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### IntelliJ IDEA ### +.idea/modules.xml +.idea/jarRepositories.xml +.idea/compiler.xml +.idea/libraries/ +*.iws +*.iml +*.ipr + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..35410ca --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..aa00ffa --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..a00d84f --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + Android + + + CorrectnessLintAndroid + + + FreeMarker + + + HTML + + + JSP检查 + + + Java EE + + + JavaScript 和 TypeScript + + + LintAndroid + + + RESTful Web 服务(JAX-RS) + + + SecurityLintAndroid + + + UsabilityLintAndroid + + + 常规JavaScript 和 TypeScript + + + + + Android + + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..2b63946 --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/news/www.nanshan.edu.cn/4dc4cc62295b4354d089e3899b0111a1.json b/news/www.nanshan.edu.cn/4dc4cc62295b4354d089e3899b0111a1.json new file mode 100644 index 0000000..4203f6e --- /dev/null +++ b/news/www.nanshan.edu.cn/4dc4cc62295b4354d089e3899b0111a1.json @@ -0,0 +1 @@ +{"url":"http://www.nanshan.edu.cn/info/1051/8981.htm","title":"烟台南山学院纺织与服装学院党总支荣获“烟台市先进基层党组织”","date":"2024-07-05","source":"纺织与服装学院","content":"中国共产党成立103周年前夕,为表彰先进、树立标杆、凝聚力量,中共烟台市委公布烟台市优秀共产党员、优秀党务工作者、先进基层党组织表彰对象名单。烟台南山学院纺织与服装学院党总支被表彰为“烟台市先进基层党组织”。 近年来,烟台南山学院纺织与服装学院党总支坚持落实立德树人根本任务,紧紧围绕“党建引领、立德树人、校企一体、协同育人”办学理念布局工作,积极打造“匠心智尚·中国结”党建品牌,推动学院各项工作高质量发展。 在党总支引领下,纺织与服装学院教育教学成果丰硕,教科研工作质量显著提升。党总支与龙口市下丁家镇机关党支部联建,开展“联建共建强党建,凝心聚力促发展”等主题活动;深化校企融合,与山东南山智尚科技股份有限公司党支部联建共建,实现组织联建、科研联攻、人才联动、效益联创;深化产业合作,成立“黄河流域纺织服装校企科技创新联盟”,成立“智尚”纺织服装产业学院。学院共发表学术论文100余篇,授权专利50余项,获批山东省基层教学组织1项,山东省高等教育示范性实习(实训)基地1项,山东省一流本科专业建设点1项,省级教研项目14项,“纺织之光”教学成果奖15项,山东省工程研究中心1项,省级科研平台4个,山东省高等学校课程思政教学改革研究项目1项,山东省本科教学改革研究重点项目1项,荣获省部级科技奖励10余项。 纺织与服装学院党总支将继续深入学习贯彻党的二十大精神,牢记为党育人、为国育才使命,深化“党建+教育”工作,依托校企地共建联建,加快学院高质量发展,奋力书写高水平应用型大学育人新篇章。"} \ No newline at end of file diff --git a/news/www.nanshan.edu.cn/ea625b9b499049c11b937ce94ea00973.json b/news/www.nanshan.edu.cn/ea625b9b499049c11b937ce94ea00973.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/news/www.nanshan.edu.cn/ea625b9b499049c11b937ce94ea00973.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..1ec7ce9 --- /dev/null +++ b/pom.xml @@ -0,0 +1,48 @@ + + + 4.0.0 + + com.mobai + web-magic-test + 1.0-SNAPSHOT + + + 8 + 8 + UTF-8 + + + + + + org.springframework.boot + spring-boot-starter + 2.7.15 + + + + us.codecraft + webmagic-core + 0.7.3 + + + us.codecraft + webmagic-extension + 0.7.3 + + + + log4j + log4j + 1.2.17 + + + + org.projectlombok + lombok + 1.18.28 + + + \ No newline at end of file diff --git a/src/main/java/com/mobai/Main.java b/src/main/java/com/mobai/Main.java new file mode 100644 index 0000000..c354664 --- /dev/null +++ b/src/main/java/com/mobai/Main.java @@ -0,0 +1,17 @@ +package com.mobai; + +//TIP 要运行代码,请按 或 +// 点击装订区域中的 图标。 +public class Main { + public static void main(String[] args) { + //TIP 当文本光标位于高亮显示的文本处时按 + // 查看 IntelliJ IDEA 建议如何修正。 + System.out.printf("Hello and welcome!"); + + for (int i = 1; i <= 5; i++) { + //TIP 按 开始调试代码。我们已经设置了一个 断点 + // 但您始终可以通过按 添加更多断点。 + System.out.println("i = " + i); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java new file mode 100644 index 0000000..67d7769 --- /dev/null +++ b/src/main/java/com/mobai/webMagic/OschinaBlogPageProcessor.java @@ -0,0 +1,77 @@ +package com.mobai.webMagic; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.JsonFilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selectable; + +import java.util.List; + +public class OschinaBlogPageProcessor implements PageProcessor { + + private static final Logger log = Logger.getLogger(OschinaBlogPageProcessor.class); + // private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setRetryTimes(3) +// .setSleepTime(24*60*60*1000); + .setSleepTime(1000); + + @Override + public void process(Page page) { + // http://www.nanshan.edu.cn/info/1051/8951.htm + // http://www.nanshan.edu.cn/nyyw.htm + // http://www.nanshan.edu.cn/nyyw/123.htm + Selectable url1 = page.getUrl(); +// Selectable xpath = page.getUrl().xpath("http://www.nanshan.edu.cn/nyyw.123.*"); + page.getRequest().getUrl(); +// if (!url1.match()) { + if (url1 == null) { + log.error("url为空"); + return; + } + log.info(url1.get()); + if (("http://www.nanshan.edu.cn/nyyw.*").matches(url1.get())) { + // 获取烟台南山学院的新闻Url + List newsUrls = page.getHtml() + .xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a").links().all(); + // 存在分页,将下一页url 添加到待采集列表 + String nextPage = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]").links().get(); + newsUrls.add(nextPage); + // 添加 + page.addTargetRequests(newsUrls); + } else { + String url = page.getUrl().toString(); + String title = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/h1/text()").get(); + // 日期等信息需分割 包括 来源 + String newsHead = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()").get(); + String[] split = newsHead.split(":"); + String date = newsHead.substring(newsHead.indexOf(split[2]), newsHead.indexOf(split[2]) + 10); + String source = newsHead.substring(newsHead.indexOf(split[5]), newsHead.indexOf(split[5]) + 7); + String content = page.getHtml().xpath("/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()").get(); + page.putField("url", url); + page.putField("title", title); + page.putField("date", date); + page.putField("source", source); + page.putField("content", content); + } + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + System.out.println("页面提取执行完毕"); + + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcessor()) + .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm") + .addPipeline(new JsonFilePipeline("news/")) + .run(); + } +} \ No newline at end of file diff --git a/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java new file mode 100644 index 0000000..9b388bb --- /dev/null +++ b/src/main/java/com/mobai/webMagic/util/ImageDownloaderUtil.java @@ -0,0 +1,50 @@ +package com.mobai.webMagic.util; + +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +/** + * 该方法使用HttpClient下载图片、PDF、压缩文件等 + * @Author: m + * @Date: 2023/7/25 11:44 下午 + * @Version 1.0 + */ +public class ImageDownloaderUtil { + + private static Logger logger = LoggerFactory.getLogger(ImageDownloaderUtil.class); + + public static synchronized void downLoadImage(String url, String fileName) { + //初始化HttpClient + HttpClient httpClient = HttpClients.custom().build(); + HttpGet httpGet = new HttpGet(url); + //获取结果 + HttpResponse httpResponse = null; + try { + httpResponse = httpClient.execute(httpGet); + } catch (IOException e) { + logger.warn("execute http request fail:", e); + } + //非常简单的下载方法 + try { + OutputStream out = new FileOutputStream(fileName); + httpResponse.getEntity().writeTo(out); + } catch (Exception e) { + logger.warn("save file fail:", e); + } + try { + //消耗实体 + EntityUtils.consume(httpResponse.getEntity()); + } catch (IOException e) { + logger.warn("consume entity fail:", e); + } + } +}