feat():测试成功爬虫框架

subject:获取烟台南山学院单个新闻信息并存入news文件夹
master
Saisai 2024-08-12 11:13:18 +08:00
commit 80efdc800a
11 changed files with 435 additions and 0 deletions

38
.gitignore vendored 100644
View File

@ -0,0 +1,38 @@
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### IntelliJ IDEA ###
.idea/modules.xml
.idea/jarRepositories.xml
.idea/compiler.xml
.idea/libraries/
*.iws
*.iml
*.ipr
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store

8
.idea/.gitignore vendored 100644
View File

@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>

64
.idea/misc.xml 100644
View File

@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectInspectionProfilesVisibleTreeState">
<entry key="Project Default">
<profile-state>
<expanded-state>
<State />
<State>
<id>Android</id>
</State>
<State>
<id>CorrectnessLintAndroid</id>
</State>
<State>
<id>FreeMarker</id>
</State>
<State>
<id>HTML</id>
</State>
<State>
<id>JSP检查</id>
</State>
<State>
<id>Java EE</id>
</State>
<State>
<id>JavaScript 和 TypeScript</id>
</State>
<State>
<id>LintAndroid</id>
</State>
<State>
<id>RESTful Web 服务(JAX-RS)</id>
</State>
<State>
<id>SecurityLintAndroid</id>
</State>
<State>
<id>UsabilityLintAndroid</id>
</State>
<State>
<id>常规JavaScript 和 TypeScript</id>
</State>
</expanded-state>
<selected-state>
<State>
<id>Android</id>
</State>
</selected-state>
</profile-state>
</entry>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

View File

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.svg" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>

View File

@ -0,0 +1 @@
{"url":"http://www.nanshan.edu.cn/info/1051/8981.htm","title":"烟台南山学院纺织与服装学院党总支荣获“烟台市先进基层党组织”","date":"2024-07-05","source":"纺织与服装学院","content":"中国共产党成立103周年前夕为表彰先进、树立标杆、凝聚力量中共烟台市委公布烟台市优秀共产党员、优秀党务工作者、先进基层党组织表彰对象名单。烟台南山学院纺织与服装学院党总支被表彰为“烟台市先进基层党组织”。 近年来,烟台南山学院纺织与服装学院党总支坚持落实立德树人根本任务,紧紧围绕“党建引领、立德树人、校企一体、协同育人”办学理念布局工作,积极打造“匠心智尚·中国结”党建品牌,推动学院各项工作高质量发展。 在党总支引领下纺织与服装学院教育教学成果丰硕教科研工作质量显著提升。党总支与龙口市下丁家镇机关党支部联建开展“联建共建强党建凝心聚力促发展”等主题活动深化校企融合与山东南山智尚科技股份有限公司党支部联建共建实现组织联建、科研联攻、人才联动、效益联创深化产业合作成立“黄河流域纺织服装校企科技创新联盟”成立“智尚”纺织服装产业学院。学院共发表学术论文100余篇授权专利50余项获批山东省基层教学组织1项山东省高等教育示范性实习实训基地1项山东省一流本科专业建设点1项省级教研项目14项“纺织之光”教学成果奖15项山东省工程研究中心1项省级科研平台4个山东省高等学校课程思政教学改革研究项目1项山东省本科教学改革研究重点项目1项荣获省部级科技奖励10余项。 纺织与服装学院党总支将继续深入学习贯彻党的二十大精神,牢记为党育人、为国育才使命,深化“党建+教育”工作,依托校企地共建联建,加快学院高质量发展,奋力书写高水平应用型大学育人新篇章。"}

48
pom.xml 100644
View File

@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mobai</groupId>
<artifactId>web-magic-test</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- 构建springboot项目 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<version>2.7.15</version>
</dependency>
<!-- 网页索取依赖 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,17 @@
package com.mobai;
//TIP 要<b>运行</b>代码,请按 <shortcut actionId="Run"/> 或
// 点击装订区域中的 <icon src="AllIcons.Actions.Execute"/> 图标。
/**
 * Minimal entry point generated from the IDE project template.
 */
public class Main {

    /**
     * Prints a greeting (without a trailing newline) followed by the
     * counter values 1 through 5, one per line.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {
        // The greeting contains no format specifiers, so print() emits the same text.
        System.out.print("Hello and welcome!");

        int counter = 1;
        while (counter <= 5) {
            System.out.println("i = " + counter);
            counter++;
        }
    }
}

View File

@ -0,0 +1,77 @@
package com.mobai.webMagic;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * WebMagic page processor for the news section of www.nanshan.edu.cn.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>list pages (e.g. {@code http://www.nanshan.edu.cn/nyyw.htm},
 *       {@code http://www.nanshan.edu.cn/nyyw/123.htm}): the article links and
 *       the next-page link are queued for crawling;</li>
 *   <li>every other page (e.g. {@code http://www.nanshan.edu.cn/info/1051/8951.htm})
 *       is treated as an article and its url, title, date, source and content
 *       are stored as result fields.</li>
 * </ul>
 */
public class OschinaBlogPageProcessor implements PageProcessor {

    private static final Logger log = Logger.getLogger(OschinaBlogPageProcessor.class);

    /** Common prefix of the news list page and its paginated variants. */
    private static final String NEWS_LIST_URL_PREFIX = "http://www.nanshan.edu.cn/nyyw";

    private static final String NEWS_LINKS_XPATH =
            "/html/body/div[5]/div[1]/div[1]/div[2]/div/ul/li/a";
    private static final String NEXT_PAGE_XPATH =
            "/html/body/div[5]/div[1]/div[1]/div[5]/div/div[2]/div/table/tbody/tr/table/tr/td[2]/div/a[2]";
    private static final String TITLE_XPATH =
            "/html/body/div[5]/div[1]/div[1]/div[2]/form/div/h1/text()";
    private static final String NEWS_HEAD_XPATH =
            "/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[1]/text()";
    private static final String CONTENT_XPATH =
            "/html/body/div[5]/div[1]/div[1]/div[2]/form/div/div[2]/allText()";

    /** Position of the date within the split article header, and its length in characters. */
    private static final int DATE_TOKEN_INDEX = 2;
    private static final int DATE_LENGTH = 10;

    /** Position of the source within the split article header, and its length in characters. */
    private static final int SOURCE_TOKEN_INDEX = 5;
    private static final int SOURCE_LENGTH = 7;

    /** Crawl settings: retry a failed download 3 times, sleep 1000 ms between requests. */
    private final Site site = Site.me().setRetryTimes(3)
            .setSleepTime(1000);

    /**
     * Dispatches the downloaded page to list-page or article-page handling
     * based on its URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Selectable urlSelectable = page.getUrl();
        String url = urlSelectable == null ? null : urlSelectable.get();
        if (url == null) {
            log.error("url为空");
            return;
        }
        log.info(url);

        // The URL must be tested against the list-page prefix, not used as the pattern itself.
        if (url.startsWith(NEWS_LIST_URL_PREFIX)) {
            processListPage(page);
        } else {
            processArticlePage(page, url);
        }
        System.out.println("页面提取执行完毕");
    }

    /** Queues every article link on a list page plus the next-page link, if there is one. */
    private void processListPage(Page page) {
        List<String> newsUrls = page.getHtml().xpath(NEWS_LINKS_XPATH).links().all();
        page.addTargetRequests(newsUrls);

        // The last list page has no next-page link; queue it separately instead of
        // appending a possibly-null value to the list returned by the selector.
        String nextPage = page.getHtml().xpath(NEXT_PAGE_XPATH).links().get();
        if (nextPage != null) {
            page.addTargetRequest(nextPage);
        }

        // A list page yields no fields of its own, so keep it out of the pipelines.
        page.setSkip(true);
    }

    /** Extracts the article fields and stores them on the page; skips the page if the header cannot be parsed. */
    private void processArticlePage(Page page, String url) {
        String title = page.getHtml().xpath(TITLE_XPATH).get();
        // The header line carries both the publication date and the source.
        String newsHead = page.getHtml().xpath(NEWS_HEAD_XPATH).get();
        if (newsHead == null) {
            log.warn("news header not found, page skipped: " + url);
            page.setSkip(true);
            return;
        }

        // NOTE(review): an empty delimiter splits the header into single characters,
        // so the token indices below address characters rather than fields.
        // Confirm the intended delimiter against the live page markup.
        String[] split = newsHead.split("");
        String date = sliceAtToken(newsHead, split, DATE_TOKEN_INDEX, DATE_LENGTH);
        String source = sliceAtToken(newsHead, split, SOURCE_TOKEN_INDEX, SOURCE_LENGTH);
        if (date == null || source == null) {
            log.warn("unexpected news header format, page skipped: " + url);
            page.setSkip(true);
            return;
        }

        String content = page.getHtml().xpath(CONTENT_XPATH).get();

        page.putField("url", url);
        page.putField("title", title);
        page.putField("date", date);
        page.putField("source", source);
        page.putField("content", content);
    }

    /**
     * Returns {@code length} characters of {@code text} starting at the first
     * occurrence of {@code tokens[tokenIndex]}, or {@code null} when the token
     * does not exist or the slice would run past the end of {@code text}.
     */
    private static String sliceAtToken(String text, String[] tokens, int tokenIndex, int length) {
        if (tokenIndex >= tokens.length) {
            return null;
        }
        int start = text.indexOf(tokens[tokenIndex]);
        int end = start + length;
        if (start < 0 || end > text.length()) {
            return null;
        }
        return text.substring(start, end);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls a single news article and writes the extracted fields as JSON
     * under the {@code news/} directory.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {
        Spider.create(new OschinaBlogPageProcessor())
                .addUrl("http://www.nanshan.edu.cn/info/1051/8981.htm")
                .addPipeline(new JsonFilePipeline("news/"))
                .run();
    }
}

View File

@ -0,0 +1,50 @@
package com.mobai.webMagic.util;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
/**
 * Downloads a remote resource with Apache HttpClient and saves it to a local file.
 *
 * @Author: m
 * @Date: 2023/7/25 11:44
 * @Version 1.0
 */
public class ImageDownloaderUtil {

    private static final Logger logger = LoggerFactory.getLogger(ImageDownloaderUtil.class);

    /**
     * Fetches {@code url} with an HTTP GET and writes the response body to
     * {@code fileName}.
     *
     * <p>I/O failures are logged as warnings and not rethrown. Nothing is written
     * when the request fails, the status is not 2xx, or the response has no body.
     *
     * @param url      address of the resource to download
     * @param fileName path of the file to write
     */
    public static synchronized void downLoadImage(String url, String fileName) {
        HttpGet httpGet = new HttpGet(url);
        // try-with-resources closes both the response and the client on every path.
        try (CloseableHttpClient httpClient = HttpClients.custom().build();
             CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                // Do not save an error page as if it were the requested file.
                logger.warn("unexpected http status {} for {}", status, url);
                return;
            }
            HttpEntity entity = httpResponse.getEntity();
            if (entity == null) {
                logger.warn("empty response body for {}", url);
                return;
            }
            saveEntity(entity, fileName);
        } catch (IOException e) {
            logger.warn("execute http request fail:", e);
        }
    }

    /** Writes the entity to {@code fileName}, then consumes whatever is left of it. */
    private static void saveEntity(HttpEntity entity, String fileName) {
        try (OutputStream out = new FileOutputStream(fileName)) {
            entity.writeTo(out);
        } catch (IOException | RuntimeException e) {
            // Runtime failures (e.g. a null file name) are logged rather than propagated.
            logger.warn("save file fail:", e);
        } finally {
            try {
                EntityUtils.consume(entity);
            } catch (IOException e) {
                logger.warn("consume entity fail:", e);
            }
        }
    }
}