Compare commits

...

13 Commits

Author SHA1 Message Date
Saisai 5016700658 feat():爬虫框架 2024-08-12 11:03:09 +08:00
Joe Zhou 9d75cce16d Merge branch 'release/1.0.0' 2024-07-05 00:27:50 +08:00
Joe Zhou 3e9cd9b5c3 Update versions for release 2024-07-05 00:20:28 +08:00
Niu_XZ 4d0cdb011f
stopWhenComplete,增加动态修改完成时停止方法。 (#1169)
Co-authored-by: niuxiaozu <niuxiaozu@yeah.net>
2024-06-17 17:27:28 +08:00
Joe Zhou 49a5efff46 Add a private constructor to hide the implicit public one. 2024-06-04 01:02:45 +08:00
Joe Zhou 5c43e36118 Make sure the contentType of detectCharset could be null. 2024-06-04 00:59:30 +08:00
Joe Zhou d2aebc60a7 Make getCharset to support null parameter. 2024-06-04 00:57:28 +08:00
Sutra Zhou 7d2d2244b3 Upgrade oxerr-parent from 2.1.0 to 2.2.1. 2024-05-21 12:55:05 +08:00
Sutra Zhou 16a4fe3e28 Use oxerr-parent instead. 2024-05-17 13:17:13 +08:00
Sutra Zhou 4ee30c0592 Merge tag 'WebMagic-0.10.3' into develop
Tag hotfix
2024-04-23 23:41:18 +08:00
Sutra Zhou f7fdcd025f Merge branch 'hotfix/0.10.3' 2024-04-23 23:39:16 +08:00
Sutra Zhou 462c60fef2 Fix for entity is null. 2024-04-23 23:37:46 +08:00
Sutra Zhou 54aef0f0e0 Update versions for hotfix 2024-04-23 23:34:15 +08:00
24 changed files with 204 additions and 288 deletions

8
.idea/.gitignore vendored 100644
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,10 @@
<!-- Project code style scheme: Kotlin code uses the KOTLIN_OFFICIAL style defaults. -->
<component name="ProjectCodeStyleConfiguration">
  <code_scheme name="Project" version="173">
    <JetCodeStyleSettings>
      <option name="CODE_STYLE_DEFAULTS" value="KOTLIN_OFFICIAL" />
    </JetCodeStyleSettings>
    <codeStyleSettings language="kotlin">
      <option name="CODE_STYLE_DEFAULTS" value="KOTLIN_OFFICIAL" />
    </codeStyleSettings>
  </code_scheme>
</component>

View File

@ -0,0 +1,5 @@
<!-- Switches the IDE to the per-project code style settings. -->
<component name="ProjectCodeStyleConfiguration">
  <state>
    <option name="USE_PER_PROJECT_SETTINGS" value="true" />
  </state>
</component>

18
.idea/compiler.xml 100644
View File

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE compiler settings: enables the Maven default annotation-processing profile for the listed webmagic modules. -->
<project version="4">
  <component name="CompilerConfiguration">
    <annotationProcessing>
      <profile name="Maven default annotation processors profile" enabled="true">
        <!-- Generated sources are written under target/, relative to each module's content root. -->
        <sourceOutputDir name="target/generated-sources/annotations" />
        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
        <outputRelativeToContentRoot value="true" />
        <module name="webmagic-samples" />
        <module name="webmagic-saxon" />
        <module name="webmagic-core" />
        <module name="webmagic-extension" />
        <module name="webmagic-scripts" />
        <module name="webmagic-selenium" />
      </profile>
    </annotationProcessing>
  </component>
</project>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Per-directory file encodings: every listed source and resource root is declared as UTF-8. -->
<project version="4">
  <component name="Encoding">
    <file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-core/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-core/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-coverage/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-coverage/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-extension/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-extension/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-samples/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-samples/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-saxon/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-saxon/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-scripts/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-scripts/src/main/resources" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-selenium/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/webmagic-selenium/src/main/resources" charset="UTF-8" />
  </component>
</project>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Remote Maven repositories recorded for this project. -->
<project version="4">
  <component name="RemoteRepositoriesConfiguration">
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Maven Central repository" />
      <option name="url" value="https://repo1.maven.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="jboss.community" />
      <option name="name" value="JBoss Community repository" />
      <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
    </remote-repository>
    <!-- NOTE(review): this entry reuses the id "central" from the first entry but points at the Aliyun URL;
         presumably it comes from a developer-local mirror setting. Confirm it is meant to be shared. -->
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Central Repository" />
      <option name="url" value="https://maven.aliyun.com/repository/central" />
    </remote-repository>
  </component>
</project>

62
.idea/misc.xml 100644
View File

@ -0,0 +1,62 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level IDE settings: the root Maven POM, saved inspection-tree UI state, and the project JDK (17). -->
<project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
      </list>
    </option>
  </component>
  <!-- NOTE(review): the block below looks like per-user UI state (expanded/selected tree nodes); confirm it should be versioned. -->
  <component name="ProjectInspectionProfilesVisibleTreeState">
    <entry key="Project Default">
      <profile-state>
        <expanded-state>
          <State />
          <State>
            <id>Android</id>
          </State>
          <State>
            <id>CorrectnessLintAndroid</id>
          </State>
          <State>
            <id>FreeMarker</id>
          </State>
          <State>
            <id>HTML</id>
          </State>
          <State>
            <id>JSP检查</id>
          </State>
          <State>
            <id>Java EE</id>
          </State>
          <State>
            <id>JavaScript 和 TypeScript</id>
          </State>
          <State>
            <id>LintAndroid</id>
          </State>
          <State>
            <id>RESTful Web 服务(JAX-RS)</id>
          </State>
          <State>
            <id>SecurityLintAndroid</id>
          </State>
          <State>
            <id>UsabilityLintAndroid</id>
          </State>
          <State>
            <id>常规JavaScript 和 TypeScript</id>
          </State>
        </expanded-state>
        <selected-state>
          <State>
            <id>Android</id>
          </State>
        </selected-state>
      </profile-state>
    </entry>
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" project-jdk-name="17" project-jdk-type="JavaSDK" />
</project>

6
.idea/vcs.xml 100644
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- VCS mapping: the empty directory value (presumably the project root) is tracked with Git. -->
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

283
pom.xml
View File

@ -5,9 +5,14 @@
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.10.3</version>
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.oxerr</groupId>
<artifactId>oxerr-parent</artifactId>
<version>2.2.1</version>
</parent>
<groupId>us.codecraft</groupId>
<version>1.0.0</version>
<packaging>pom</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@ -272,73 +277,6 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.6.3</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
<!--<plugin>-->
<!--<groupId>org.apache.maven.plugins</groupId>-->
<!--<artifactId>maven-dependency-plugin</artifactId>-->
<!--<version>2.8</version>-->
<!--<executions>-->
<!--<execution>-->
<!--<id>copy-dependencies</id>-->
<!--<phase>package</phase>-->
<!--<goals>-->
<!--<goal>copy-dependencies</goal>-->
<!--</goals>-->
<!--<configuration>-->
<!--<outputDirectory>${project.build.directory}/lib</outputDirectory>-->
<!--<overWriteReleases>false</overWriteReleases>-->
<!--<overWriteSnapshots>false</overWriteSnapshots>-->
<!--<overWriteIfNewer>true</overWriteIfNewer>-->
<!--</configuration>-->
<!--</execution>-->
<!--</executions>-->
<!--</plugin>-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
@ -366,10 +304,6 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
@ -398,209 +332,6 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-clean-plugin</artifactId>
<version>3.3.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.13.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.1.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.4.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>3.1.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.4.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.6.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
<version>3.3.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.21.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.3.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>4.0.0-M13</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.5</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
<version>3.2.5</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.12</version>
</plugin>
<plugin>
<groupId>com.amashchenko.maven.plugin</groupId>
<artifactId>gitflow-maven-plugin</artifactId>
<version>1.21.0</version>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
<version>4.8.4.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<configuration>
<doclint>none</doclint>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>taglist-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>com.github.spotbugs</groupId>
<artifactId>spotbugs-maven-plugin</artifactId>
</plugin>
</plugins>
</reporting>
<profiles>
<profile>
<id>release</id>
<build>
<plugins>
<!-- Source -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.3.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Javadoc -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- GPG -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.2.4</version>
<executions>
<execution>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>sonatype-nexus-staging</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
</plugins>
</build>
<distributionManagement>
<snapshotRepository>
<id>sonatype-nexus-snapshots</id>
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
</snapshotRepository>
<repository>
<id>sonatype-nexus-staging</id>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
</profile>
</profiles>
</project>

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -85,7 +85,7 @@ public class Spider implements Runnable, Task {
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected boolean exitWhenComplete = true;
protected volatile boolean exitWhenComplete = true;
protected final static int STAT_INIT = 0;
@ -598,6 +598,13 @@ public class Spider implements Runnable, Task {
}
}
/**
 * Sets {@code exitWhenComplete} to {@code true}.
 * <p>
 * Intended effect, per the original description: stop when all tasks in the queue
 * are completed and all worker threads are also completed.
 * NOTE(review): the code that reads this flag is outside the visible hunk; confirm
 * the exact stopping behavior there.
 */
public void stopWhenComplete(){
this.exitWhenComplete = true;
}
/**
* start with more than one threads
*

View File

@ -103,8 +103,8 @@ public class HttpClientDownloader extends AbstractDownloader {
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
HttpEntity entity = httpResponse.getEntity();
byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];;
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()) {

View File

@ -21,6 +21,10 @@ public abstract class CharsetUtils {
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
/**
 * Private constructor that hides the implicit public one.
 * It always throws, so any call that does reach it fails immediately.
 *
 * @throws AssertionError always
 */
private CharsetUtils() {
throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!");
}
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
String charset;
// charset

View File

@ -116,6 +116,10 @@ public class UrlUtils {
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
public static String getCharset(String contentType) {
if (contentType == null) {
return null;
}
Matcher matcher = patternForCharset.matcher(contentType);
if (matcher.find()) {
String charset = matcher.group(1);

View File

@ -0,0 +1,16 @@
package us.codecraft.webmagic.utils;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.IOException;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link CharsetUtils}.
 */
class CharsetUtilsTest {

    /**
     * A {@code null} content type combined with an empty body is expected to
     * yield a {@code null} charset.
     *
     * @throws IOException declared by {@code detectCharset}
     */
    @Test
    void testDetectCharset() throws IOException {
        final String noContentType = null;
        final byte[] emptyBody = new byte[0];

        final String detected = CharsetUtils.detectCharset(noContentType, emptyBody);

        assertNull(detected);
    }

}

View File

@ -1,5 +1,7 @@
package us.codecraft.webmagic.utils;
import static org.junit.Assert.assertNull;
import org.junit.Assert;
import org.junit.Test;
@ -43,5 +45,9 @@ public class UrlUtilsTest {
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
}
/**
 * Checks that {@code UrlUtils.getCharset} returns {@code null} when the
 * content type passed in is {@code null}.
 */
@Test
public void testGetCharset() {
assertNull(UrlUtils.getCharset(null));
}
}

View File

@ -10,7 +10,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<artifactId>webmagic-coverage</artifactId>

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -13,7 +13,6 @@ import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public class ConfigurablePageProcessorTest {

View File

@ -12,7 +12,6 @@ import static org.assertj.core.api.Assertions.assertThat;
/**
* @author code4crafter@gmail.com
* @date 14-4-4
*/
public class ModelPageProcessorTest {

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.10.3</version>
<version>1.0.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>