Merge branch 'stable' of github.com:code4craft/webmagic
commit
8c33be48a6
|
@ -4,3 +4,6 @@ out/
|
||||||
.idea
|
.idea
|
||||||
.classpath
|
.classpath
|
||||||
.project
|
.project
|
||||||
|
.settings/
|
||||||
|
bin/
|
||||||
|
.myeclipse
|
||||||
|
|
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
79
pom.xml
79
pom.xml
|
@ -6,7 +6,7 @@
|
||||||
<version>7</version>
|
<version>7</version>
|
||||||
</parent>
|
</parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -88,13 +88,25 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
<version>0.2.2</version>
|
<version>0.2.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.alibaba</groupId>
|
<groupId>com.alibaba</groupId>
|
||||||
<artifactId>fastjson</artifactId>
|
<artifactId>fastjson</artifactId>
|
||||||
<version>1.1.37</version>
|
<version>1.1.37</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
<version>0.9.1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-simple</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>log4j</groupId>
|
<groupId>log4j</groupId>
|
||||||
<artifactId>log4j</artifactId>
|
<artifactId>log4j</artifactId>
|
||||||
|
@ -230,22 +242,44 @@
|
||||||
|
|
||||||
<profiles>
|
<profiles>
|
||||||
<profile>
|
<profile>
|
||||||
<id>release-sign-artifacts</id>
|
<id>release</id>
|
||||||
<activation>
|
|
||||||
<property>
|
|
||||||
<name>performRelease</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
</activation>
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
<!-- Source -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-source-plugin</artifactId>
|
||||||
|
<version>2.2.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>jar-no-fork</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<!-- Javadoc -->
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
<version>2.9.1</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>jar</goal>
|
||||||
|
</goals>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
<!-- GPG -->
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-gpg-plugin</artifactId>
|
<artifactId>maven-gpg-plugin</artifactId>
|
||||||
<version>1.1</version>
|
<version>1.5</version>
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>sign-artifacts</id>
|
|
||||||
<phase>verify</phase>
|
<phase>verify</phase>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>sign</goal>
|
<goal>sign</goal>
|
||||||
|
@ -253,10 +287,29 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.sonatype.plugins</groupId>
|
||||||
|
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||||
|
<version>1.6</version>
|
||||||
|
<extensions>true</extensions>
|
||||||
|
<configuration>
|
||||||
|
<serverId>sonatype-nexus-staging</serverId>
|
||||||
|
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||||
|
<autoReleaseAfterClose>true</autoReleaseAfterClose>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
|
<distributionManagement>
|
||||||
|
<snapshotRepository>
|
||||||
|
<id>sonatype-nexus-snapshots</id>
|
||||||
|
<url>https://oss.sonatype.org/content/repositories/snapshots/</url>
|
||||||
|
</snapshotRepository>
|
||||||
|
<repository>
|
||||||
|
<id>sonatype-nexus-staging</id>
|
||||||
|
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
||||||
|
</repository>
|
||||||
|
</distributionManagement>
|
||||||
</profile>
|
</profile>
|
||||||
</profiles>
|
</profiles>
|
||||||
|
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1-SNAPSHOT</version>
|
<version>0.5.2-SNAPSHOT-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1-SNAPSHOT</version>
|
<version>0.5.2-SNAPSHOT-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1-SNAPSHOT</version>
|
<version>0.5.2-SNAPSHOT-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-avalon</artifactId>
|
<artifactId>webmagic-avalon</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1-SNAPSHOT</version>
|
<version>0.5.2-SNAPSHOT-SNAPSHOT</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -35,6 +35,11 @@
|
||||||
<artifactId>xsoup</artifactId>
|
<artifactId>xsoup</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.dreamhead</groupId>
|
||||||
|
<artifactId>moco-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
|
|
@ -18,6 +18,8 @@ public class Request implements Serializable {
|
||||||
private static final long serialVersionUID = 2062192774891352043L;
|
private static final long serialVersionUID = 2062192774891352043L;
|
||||||
|
|
||||||
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
|
||||||
|
public static final String STATUS_CODE = "statusCode";
|
||||||
|
public static final String PROXY = "proxy";
|
||||||
|
|
||||||
private String url;
|
private String url;
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,8 @@ package us.codecraft.webmagic;
|
||||||
import com.google.common.collect.HashBasedTable;
|
import com.google.common.collect.HashBasedTable;
|
||||||
import com.google.common.collect.Table;
|
import com.google.common.collect.Table;
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.proxy.ProxyPool;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -47,6 +49,8 @@ public class Site {
|
||||||
|
|
||||||
private HttpHost httpProxy;
|
private HttpHost httpProxy;
|
||||||
|
|
||||||
|
private ProxyPool httpProxyPool=new ProxyPool();
|
||||||
|
|
||||||
private boolean useGzip = true;
|
private boolean useGzip = true;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -438,4 +442,32 @@ public class Site {
|
||||||
", headers=" + headers +
|
", headers=" + headers +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
||||||
|
*
|
||||||
|
* @return this
|
||||||
|
*/
|
||||||
|
public Site setHttpProxyPool(List<String[]> httpProxyList) {
|
||||||
|
this.httpProxyPool=new ProxyPool(httpProxyList);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ProxyPool getHttpProxyPool() {
|
||||||
|
return httpProxyPool;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpHost getHttpProxyFromPool() {
|
||||||
|
return httpProxyPool.getProxy();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
|
||||||
|
httpProxyPool.returnProxy(proxy,statusCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Site setProxyReuseInterval(int reuseInterval) {
|
||||||
|
this.httpProxyPool.setReuseInterval(reuseInterval);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,7 @@ package us.codecraft.webmagic;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
|
@ -324,6 +325,10 @@ public class Spider implements Runnable, Task {
|
||||||
onError(requestFinal);
|
onError(requestFinal);
|
||||||
logger.error("process request " + requestFinal + " error", e);
|
logger.error("process request " + requestFinal + " error", e);
|
||||||
} finally {
|
} finally {
|
||||||
|
if (site.getHttpProxyPool().isEnable()) {
|
||||||
|
site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal
|
||||||
|
.getExtra(Request.STATUS_CODE));
|
||||||
|
}
|
||||||
pageCount.incrementAndGet();
|
pageCount.incrementAndGet();
|
||||||
signalNewUrl();
|
signalNewUrl();
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
import org.apache.http.HttpResponse;
|
import org.apache.http.HttpResponse;
|
||||||
import org.apache.http.NameValuePair;
|
import org.apache.http.NameValuePair;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
|
@ -12,17 +14,22 @@ import org.apache.http.client.methods.HttpUriRequest;
|
||||||
import org.apache.http.client.methods.RequestBuilder;
|
import org.apache.http.client.methods.RequestBuilder;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.util.EntityUtils;
|
import org.apache.http.util.EntityUtils;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Site;
|
import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -79,16 +86,13 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
logger.info("downloading page {}", request.getUrl());
|
logger.info("downloading page {}", request.getUrl());
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
|
int statusCode=0;
|
||||||
try {
|
try {
|
||||||
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
|
||||||
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
httpResponse = getHttpClient(site).execute(httpUriRequest);
|
||||||
int statusCode = httpResponse.getStatusLine().getStatusCode();
|
statusCode = httpResponse.getStatusLine().getStatusCode();
|
||||||
|
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||||
if (statusAccept(acceptStatCode, statusCode)) {
|
if (statusAccept(acceptStatCode, statusCode)) {
|
||||||
//charset
|
|
||||||
if (charset == null) {
|
|
||||||
String value = httpResponse.getEntity().getContentType().getValue();
|
|
||||||
charset = UrlUtils.getCharset(value);
|
|
||||||
}
|
|
||||||
Page page = handleResponse(request, charset, httpResponse, task);
|
Page page = handleResponse(request, charset, httpResponse, task);
|
||||||
onSuccess(request);
|
onSuccess(request);
|
||||||
return page;
|
return page;
|
||||||
|
@ -104,6 +108,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
onError(request);
|
onError(request);
|
||||||
return null;
|
return null;
|
||||||
} finally {
|
} finally {
|
||||||
|
request.putExtra(Request.STATUS_CODE, statusCode);
|
||||||
try {
|
try {
|
||||||
if (httpResponse != null) {
|
if (httpResponse != null) {
|
||||||
//ensure the connection is released back to pool
|
//ensure the connection is released back to pool
|
||||||
|
@ -136,8 +141,10 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
.setSocketTimeout(site.getTimeOut())
|
.setSocketTimeout(site.getTimeOut())
|
||||||
.setConnectTimeout(site.getTimeOut())
|
.setConnectTimeout(site.getTimeOut())
|
||||||
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
.setCookieSpec(CookieSpecs.BEST_MATCH);
|
||||||
if (site != null && site.getHttpProxy() != null) {
|
if (site.getHttpProxyPool().isEnable()) {
|
||||||
requestConfigBuilder.setProxy(site.getHttpProxy());
|
HttpHost host = site.getHttpProxyFromPool();
|
||||||
|
requestConfigBuilder.setProxy(host);
|
||||||
|
request.putExtra(Request.PROXY, host);
|
||||||
}
|
}
|
||||||
requestBuilder.setConfig(requestConfigBuilder.build());
|
requestBuilder.setConfig(requestConfigBuilder.build());
|
||||||
return requestBuilder.build();
|
return requestBuilder.build();
|
||||||
|
@ -168,7 +175,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
|
||||||
String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
String content = getContent(charset, httpResponse);
|
||||||
Page page = new Page();
|
Page page = new Page();
|
||||||
page.setRawText(content);
|
page.setRawText(content);
|
||||||
page.setUrl(new PlainText(request.getUrl()));
|
page.setUrl(new PlainText(request.getUrl()));
|
||||||
|
@ -176,4 +183,57 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
|
||||||
|
if (charset == null) {
|
||||||
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
|
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
|
||||||
|
if (htmlCharset != null) {
|
||||||
|
return new String(contentBytes, htmlCharset);
|
||||||
|
} else {
|
||||||
|
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
|
||||||
|
return new String(contentBytes);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
|
||||||
|
String charset;
|
||||||
|
// charset
|
||||||
|
// 1、encoding in http header Content-Type
|
||||||
|
String value = httpResponse.getEntity().getContentType().getValue();
|
||||||
|
charset = UrlUtils.getCharset(value);
|
||||||
|
if (StringUtils.isNotBlank(charset)) {
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
// use default charset to decode first time
|
||||||
|
Charset defaultCharset = Charset.defaultCharset();
|
||||||
|
String content = new String(contentBytes, defaultCharset.name());
|
||||||
|
// 2、charset in meta
|
||||||
|
if (StringUtils.isNotEmpty(content)) {
|
||||||
|
Document document = Jsoup.parse(content);
|
||||||
|
Elements links = document.select("meta");
|
||||||
|
for (Element link : links) {
|
||||||
|
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
String metaContent = link.attr("content");
|
||||||
|
String metaCharset = link.attr("charset");
|
||||||
|
if (metaContent.indexOf("charset") != -1) {
|
||||||
|
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
|
||||||
|
charset = metaContent.split("=")[1];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// 2.2、html5 <meta charset="UTF-8" />
|
||||||
|
else if (StringUtils.isNotEmpty(metaCharset)) {
|
||||||
|
charset = metaCharset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.debug("Auto get charset: {}", charset);
|
||||||
|
// 3、todo use tools as cpdetector for content decode
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,12 +4,14 @@ import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.http.annotation.ThreadSafe;
|
import org.apache.http.annotation.ThreadSafe;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import us.codecraft.webmagic.ResultItems;
|
import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
|
||||||
import java.io.FileWriter;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -39,7 +41,7 @@ public class FilePipeline extends FilePersistentBase implements Pipeline {
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
|
PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8"));
|
||||||
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
if (entry.getValue() instanceof Iterable) {
|
if (entry.getValue() instanceof Iterable) {
|
||||||
|
|
|
@ -0,0 +1,172 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.Delayed;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* >>>>Proxy Status
|
||||||
|
+----------+ +-----+
|
||||||
|
| last use | | new |
|
||||||
|
+-----+----+ +---+-+
|
||||||
|
| +------+ |
|
||||||
|
+->| init |<--+
|
||||||
|
+--+---+
|
||||||
|
|
|
||||||
|
v
|
||||||
|
+--------+
|
||||||
|
+--->| borrow |
|
||||||
|
| +---+----+
|
||||||
|
| |+------------------+
|
||||||
|
| v
|
||||||
|
| +--------+
|
||||||
|
| | in use | Response Time
|
||||||
|
| +---+----+
|
||||||
|
| |+------------------+
|
||||||
|
| v
|
||||||
|
| +--------+
|
||||||
|
| | return |
|
||||||
|
| +---+----+
|
||||||
|
| |+-------------------+
|
||||||
|
| v
|
||||||
|
| +-------+ reuse interval
|
||||||
|
| | delay | (delay time)
|
||||||
|
| +---+---+
|
||||||
|
| |+-------------------+
|
||||||
|
| v
|
||||||
|
| +------+
|
||||||
|
| | idle | idle time
|
||||||
|
| +---+--+
|
||||||
|
| |+-------------------+
|
||||||
|
+--------+
|
||||||
|
*/
|
||||||
|
public class Proxy implements Delayed, Serializable {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 228939737383625551L;
|
||||||
|
public static final int ERROR_403 = 403;
|
||||||
|
public static final int ERROR_404 = 404;
|
||||||
|
public static final int ERROR_BANNED = 10000;
|
||||||
|
public static final int ERROR_Proxy = 10001;
|
||||||
|
public static final int SUCCESS = 200;
|
||||||
|
|
||||||
|
private final HttpHost httpHost;
|
||||||
|
|
||||||
|
private int reuseTimeInterval = 1500;// ms
|
||||||
|
private Long canReuseTime = 0L;
|
||||||
|
private Long lastBorrowTime = System.currentTimeMillis();
|
||||||
|
private Long responseTime = 0L;
|
||||||
|
private Long idleTime = 0L;
|
||||||
|
|
||||||
|
private int failedNum = 0;
|
||||||
|
private int successNum = 0;
|
||||||
|
private int borrowNum = 0;
|
||||||
|
|
||||||
|
private List<Integer> failedErrorType = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
Proxy(HttpHost httpHost) {
|
||||||
|
this.httpHost = httpHost;
|
||||||
|
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
Proxy(HttpHost httpHost, int reuseInterval) {
|
||||||
|
this.httpHost = httpHost;
|
||||||
|
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSuccessNum() {
|
||||||
|
return successNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void successNumIncrement(int increment) {
|
||||||
|
this.successNum += increment;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getLastUseTime() {
|
||||||
|
return lastBorrowTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLastBorrowTime(Long lastBorrowTime) {
|
||||||
|
this.lastBorrowTime = lastBorrowTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void recordResponse() {
|
||||||
|
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
|
||||||
|
this.lastBorrowTime = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Integer> getFailedErrorType() {
|
||||||
|
return failedErrorType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFailedErrorType(List<Integer> failedErrorType) {
|
||||||
|
this.failedErrorType = failedErrorType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void fail(int failedErrorType) {
|
||||||
|
this.failedNum++;
|
||||||
|
this.failedErrorType.add(failedErrorType);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFailedNum(int failedNum) {
|
||||||
|
this.failedNum = failedNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getFailedNum() {
|
||||||
|
return failedNum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFailedType() {
|
||||||
|
String re = "";
|
||||||
|
for (Integer i : this.failedErrorType) {
|
||||||
|
re += i + " . ";
|
||||||
|
}
|
||||||
|
return re;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpHost getHttpHost() {
|
||||||
|
return httpHost;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReuseTimeInterval() {
|
||||||
|
return reuseTimeInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReuseTimeInterval(int reuseTimeInterval) {
|
||||||
|
this.reuseTimeInterval = reuseTimeInterval;
|
||||||
|
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getDelay(TimeUnit unit) {
|
||||||
|
return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(Delayed o) {
|
||||||
|
Proxy that = (Proxy) o;
|
||||||
|
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
|
||||||
|
successNum * 100.0 / borrowNum, borrowNum);
|
||||||
|
return re;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void borrowNumIncrement(int increment) {
|
||||||
|
this.borrowNum += increment;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBorrowNum() {
|
||||||
|
return borrowNum;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,290 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.net.InetAddress;
|
||||||
|
import java.net.UnknownHostException;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.DelayQueue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ClassName:ProxyPool
|
||||||
|
*
|
||||||
|
* @see
|
||||||
|
* @Function: keeps HTTP proxies in a delay queue so that each proxy becomes available only once its reuse time is reached
|
||||||
|
* @author ch
|
||||||
|
* @version Ver 1.0
|
||||||
|
* @Date 2014-2-14 下午01:10:04
|
||||||
|
*/
|
||||||
|
public class ProxyPool {
|
||||||
|
|
||||||
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
|
||||||
|
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
|
||||||
|
|
||||||
|
private int reuseInterval = 1500;// ms
|
||||||
|
private int reviveTime = 2 * 60 * 60 * 1000;// ms
|
||||||
|
|
||||||
|
private boolean isEnable = false;
|
||||||
|
private boolean validateWhenInit = false;
|
||||||
|
private String proxyFile = "data/lastUse.proxy";
|
||||||
|
|
||||||
|
private Timer timer = new Timer(true);
|
||||||
|
private TimerTask saveProxyTask = new TimerTask() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
saveProxyList();
|
||||||
|
logger.info(allProxyStatus());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Create an empty pool: no proxies are loaded, the pool stays disabled and the periodic
 * save task is not scheduled.
 */
public ProxyPool() {
}
|
||||||
|
|
||||||
|
public ProxyPool(List<String[]> httpProxyList) {
|
||||||
|
readProxyList();
|
||||||
|
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
|
||||||
|
timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void saveProxyList() {
|
||||||
|
if (allProxy.size() == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile));
|
||||||
|
os.writeObject(prepareForSaving());
|
||||||
|
os.close();
|
||||||
|
logger.info("save proxy");
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
logger.error("proxy file not found", e);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, Proxy> prepareForSaving() {
|
||||||
|
Map<String, Proxy> tmp = new HashMap<String, Proxy>();
|
||||||
|
for (Entry<String, Proxy> e : allProxy.entrySet()) {
|
||||||
|
Proxy p = e.getValue();
|
||||||
|
p.setFailedNum(0);
|
||||||
|
tmp.put(e.getKey(), p);
|
||||||
|
}
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readProxyList() {
|
||||||
|
try {
|
||||||
|
ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile));
|
||||||
|
addProxy((Map<String, Proxy>) is.readObject());
|
||||||
|
is.close();
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
logger.error("proxy file not found", e);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addProxy(Map<String, Proxy> httpProxyMap) {
|
||||||
|
isEnable = true;
|
||||||
|
for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
|
||||||
|
try {
|
||||||
|
if (allProxy.containsKey(entry.getKey())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) {
|
||||||
|
entry.getValue().setFailedNum(0);
|
||||||
|
entry.getValue().setReuseTimeInterval(reuseInterval);
|
||||||
|
proxyQueue.add(entry.getValue());
|
||||||
|
allProxy.put(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
logger.error("HttpHost init error:", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.info("proxy pool size>>>>" + allProxy.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addProxy(String[]... httpProxyList) {
|
||||||
|
isEnable = true;
|
||||||
|
for (String[] s : httpProxyList) {
|
||||||
|
try {
|
||||||
|
if (allProxy.containsKey(s[0])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
|
||||||
|
if (!validateWhenInit || ProxyUtil.validateProxy(item)) {
|
||||||
|
Proxy p = new Proxy(item, reuseInterval);
|
||||||
|
proxyQueue.add(p);
|
||||||
|
allProxy.put(s[0], p);
|
||||||
|
}
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
logger.error("HttpHost init error:", e);
|
||||||
|
} catch (UnknownHostException e) {
|
||||||
|
logger.error("HttpHost init error:", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.info("proxy pool size>>>>" + allProxy.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpHost getProxy() {
|
||||||
|
Proxy proxy = null;
|
||||||
|
try {
|
||||||
|
Long time = System.currentTimeMillis();
|
||||||
|
proxy = proxyQueue.take();
|
||||||
|
double costTime = (System.currentTimeMillis() - time) / 1000.0;
|
||||||
|
if (costTime > reuseInterval) {
|
||||||
|
logger.info("get proxy time >>>> " + costTime);
|
||||||
|
}
|
||||||
|
Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
|
||||||
|
p.setLastBorrowTime(System.currentTimeMillis());
|
||||||
|
p.borrowNumIncrement(1);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
logger.error("get proxy error", e);
|
||||||
|
}
|
||||||
|
if (proxy == null) {
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
}
|
||||||
|
return proxy.getHttpHost();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void returnProxy(HttpHost host, int statusCode) {
|
||||||
|
Proxy p = allProxy.get(host.getAddress().getHostAddress());
|
||||||
|
if (p == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
switch (statusCode) {
|
||||||
|
case Proxy.SUCCESS:
|
||||||
|
p.setReuseTimeInterval(reuseInterval);
|
||||||
|
p.setFailedNum(0);
|
||||||
|
p.setFailedErrorType(new ArrayList<Integer>());
|
||||||
|
p.recordResponse();
|
||||||
|
p.successNumIncrement(1);
|
||||||
|
break;
|
||||||
|
case Proxy.ERROR_403:
|
||||||
|
// banned,try larger interval
|
||||||
|
p.fail(Proxy.ERROR_403);
|
||||||
|
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||||
|
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
|
break;
|
||||||
|
case Proxy.ERROR_BANNED:
|
||||||
|
p.fail(Proxy.ERROR_BANNED);
|
||||||
|
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
|
||||||
|
logger.warn("this proxy is banned >>>> " + p.getHttpHost());
|
||||||
|
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
|
||||||
|
break;
|
||||||
|
case Proxy.ERROR_404:
|
||||||
|
//p.fail(Proxy.ERROR_404);
|
||||||
|
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
p.fail(statusCode);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (p.getFailedNum() > 20) {
|
||||||
|
// allProxy.remove(host.getAddress().getHostAddress());
|
||||||
|
p.setReuseTimeInterval(reviveTime);
|
||||||
|
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (p.getFailedNum()%5==0) {
|
||||||
|
if (!ProxyUtil.validateProxy(host)) {
|
||||||
|
// allProxy.remove(host.getAddress().getHostAddress());
|
||||||
|
p.setReuseTimeInterval(reviveTime);
|
||||||
|
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
proxyQueue.put(p);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
logger.warn("proxyQueue return proxy error", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String allProxyStatus() {
|
||||||
|
String re = "all proxy info >>>> \n";
|
||||||
|
for (Entry<String, Proxy> entry : allProxy.entrySet()) {
|
||||||
|
re += entry.getValue().toString() + "\n";
|
||||||
|
}
|
||||||
|
return re;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getIdleNum() {
|
||||||
|
return proxyQueue.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReuseInterval() {
|
||||||
|
return reuseInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReuseInterval(int reuseInterval) {
|
||||||
|
this.reuseInterval = reuseInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<String[]> getProxyList() {
|
||||||
|
List<String[]> proxyList = new ArrayList<String[]>();
|
||||||
|
BufferedReader br = null;
|
||||||
|
try {
|
||||||
|
br = new BufferedReader(new FileReader(new File("proxy.txt")));
|
||||||
|
|
||||||
|
String line = "";
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
|
||||||
|
}
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return proxyList;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
ProxyPool proxyPool = new ProxyPool(getProxyList());
|
||||||
|
proxyPool.setReuseInterval(10000);
|
||||||
|
// proxyPool.saveProxyList();
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
List<HttpHost> httphostList = new ArrayList<HttpHost>();
|
||||||
|
System.in.read();
|
||||||
|
int i = 0;
|
||||||
|
while (proxyPool.getIdleNum() > 2) {
|
||||||
|
HttpHost httphost = proxyPool.getProxy();
|
||||||
|
httphostList.add(httphost);
|
||||||
|
// proxyPool.proxyPool.use(httphost);
|
||||||
|
proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
System.out.println(proxyPool.allProxyStatus());
|
||||||
|
System.in.read();
|
||||||
|
for (i = 0; i < httphostList.size(); i++) {
|
||||||
|
proxyPool.returnProxy(httphostList.get(i), 200);
|
||||||
|
proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
|
||||||
|
}
|
||||||
|
System.out.println(proxyPool.allProxyStatus());
|
||||||
|
System.in.read();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void enable(boolean isEnable) {
|
||||||
|
this.isEnable = isEnable;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnable() {
|
||||||
|
return isEnable;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
package us.codecraft.webmagic.proxy;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.Inet6Address;
|
||||||
|
import java.net.InetAddress;
|
||||||
|
import java.net.InetSocketAddress;
|
||||||
|
import java.net.NetworkInterface;
|
||||||
|
import java.net.Socket;
|
||||||
|
import java.net.SocketException;
|
||||||
|
import java.util.Enumeration;
|
||||||
|
|
||||||
|
import org.apache.http.HttpHost;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ClassName:ProxyUtil
|
||||||
|
*
|
||||||
|
* @see
|
||||||
|
* @author ch
|
||||||
|
* @version Ver 1.0
|
||||||
|
* @Date 2014-2-16 下午04:20:07
|
||||||
|
*/
|
||||||
|
public class ProxyUtil {
|
||||||
|
// TODO 改为单例
|
||||||
|
private static InetAddress localAddr;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class);
|
||||||
|
static {
|
||||||
|
init();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void init() {
|
||||||
|
Enumeration<InetAddress> localAddrs;
|
||||||
|
try {
|
||||||
|
NetworkInterface ni = NetworkInterface.getByName("eth7");
|
||||||
|
if (ni == null) {
|
||||||
|
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||||
|
}
|
||||||
|
localAddrs = ni.getInetAddresses();
|
||||||
|
while (localAddrs.hasMoreElements()) {
|
||||||
|
InetAddress tmp = localAddrs.nextElement();
|
||||||
|
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
|
||||||
|
localAddr = tmp;
|
||||||
|
logger.info("local IP:" + localAddr.getHostAddress());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Failure when init ProxyUtil", e);
|
||||||
|
logger.error("choose NetworkInterface\n" + getNetworkInterface());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean validateProxy(HttpHost p) {
|
||||||
|
if (localAddr == null) {
|
||||||
|
logger.error("cannot get local ip");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
boolean isReachable = false;
|
||||||
|
Socket socket = null;
|
||||||
|
try {
|
||||||
|
socket = new Socket();
|
||||||
|
socket.bind(new InetSocketAddress(localAddr, 0));
|
||||||
|
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort());
|
||||||
|
socket.connect(endpointSocketAddr, 3000);
|
||||||
|
logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
||||||
|
isReachable = true;
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
|
||||||
|
} finally {
|
||||||
|
if (socket != null) {
|
||||||
|
try {
|
||||||
|
socket.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.warn("Error occurred while closing socket of validating proxy", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return isReachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getNetworkInterface() {
|
||||||
|
String networkInterfaceName = "";
|
||||||
|
Enumeration<NetworkInterface> enumeration = null;
|
||||||
|
try {
|
||||||
|
enumeration = NetworkInterface.getNetworkInterfaces();
|
||||||
|
} catch (SocketException e1) {
|
||||||
|
e1.printStackTrace();
|
||||||
|
}
|
||||||
|
while (enumeration.hasMoreElements()) {
|
||||||
|
NetworkInterface networkInterface = enumeration.nextElement();
|
||||||
|
networkInterfaceName += networkInterface.toString() + '\n';
|
||||||
|
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
|
||||||
|
while (addr.hasMoreElements()) {
|
||||||
|
networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return networkInterfaceName;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,109 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.2
|
||||||
|
*/
|
||||||
|
public abstract class AbstractSelectable implements Selectable {
|
||||||
|
|
||||||
|
protected abstract List<String> getSourceTexts();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable css(String selector) {
|
||||||
|
return $(selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable css(String selector, String attrName) {
|
||||||
|
return $(selector, attrName);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Selectable select(Selector selector, List<String> strings) {
|
||||||
|
List<String> results = new ArrayList<String>();
|
||||||
|
for (String string : strings) {
|
||||||
|
String result = selector.select(string);
|
||||||
|
if (result != null) {
|
||||||
|
results.add(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return new PlainText(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Selectable selectList(Selector selector, List<String> strings) {
|
||||||
|
List<String> results = new ArrayList<String>();
|
||||||
|
for (String string : strings) {
|
||||||
|
List<String> result = selector.selectList(string);
|
||||||
|
results.addAll(result);
|
||||||
|
}
|
||||||
|
return new PlainText(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> all() {
|
||||||
|
return getSourceTexts();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable jsonPath(String jsonPath) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String get() {
|
||||||
|
if (CollectionUtils.isNotEmpty(all())) {
|
||||||
|
return all().get(0);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable select(Selector selector) {
|
||||||
|
return select(selector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable selectList(Selector selector) {
|
||||||
|
return selectList(selector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable regex(String regex) {
|
||||||
|
RegexSelector regexSelector = Selectors.regex(regex);
|
||||||
|
return selectList(regexSelector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable regex(String regex, int group) {
|
||||||
|
RegexSelector regexSelector = Selectors.regex(regex, group);
|
||||||
|
return selectList(regexSelector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable replace(String regex, String replacement) {
|
||||||
|
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
||||||
|
return select(replaceSelector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFirstSourceText() {
|
||||||
|
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
|
||||||
|
return getSourceTexts().get(0);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean match() {
|
||||||
|
return getSourceTexts() != null && getSourceTexts().size() > 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,7 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -28,4 +29,25 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Element selectElement(String text) {
|
||||||
|
if (text != null) {
|
||||||
|
return selectElement(Jsoup.parse(text));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Element> selectElements(String text) {
|
||||||
|
if (text != null) {
|
||||||
|
return selectElements(Jsoup.parse(text));
|
||||||
|
} else {
|
||||||
|
return new ArrayList<Element>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract Element selectElement(Element element);
|
||||||
|
|
||||||
|
public abstract List<Element> selectElements(Element element);
|
||||||
|
|
||||||
|
public abstract boolean hasAttribute();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String select(Element element) {
|
public String select(Element element) {
|
||||||
Elements elements = element.select(selectorText);
|
List<Element> elements = selectElements(element);
|
||||||
if (CollectionUtils.isEmpty(elements)) {
|
if (CollectionUtils.isEmpty(elements)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
|
||||||
@Override
|
@Override
|
||||||
public List<String> selectList(Element doc) {
|
public List<String> selectList(Element doc) {
|
||||||
List<String> strings = new ArrayList<String>();
|
List<String> strings = new ArrayList<String>();
|
||||||
Elements elements = doc.select(selectorText);
|
List<Element> elements = selectElements(doc);
|
||||||
if (CollectionUtils.isNotEmpty(elements)) {
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
for (Element element : elements) {
|
for (Element element : elements) {
|
||||||
String value = getValue(element);
|
String value = getValue(element);
|
||||||
|
@ -78,4 +78,23 @@ public class CssSelector extends BaseElementSelector {
|
||||||
}
|
}
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
Elements elements = element.select(selectorText);
|
||||||
|
if (CollectionUtils.isNotEmpty(elements)) {
|
||||||
|
return elements.get(0);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Element> selectElements(Element element) {
|
||||||
|
return element.select(selectorText);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasAttribute() {
|
||||||
|
return attrName != null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -14,7 +15,7 @@ import java.util.List;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class Html extends PlainText {
|
public class Html extends HtmlNode {
|
||||||
|
|
||||||
private Logger logger = LoggerFactory.getLogger(getClass());
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@ -23,123 +24,26 @@ public class Html extends PlainText {
|
||||||
*/
|
*/
|
||||||
private Document document;
|
private Document document;
|
||||||
|
|
||||||
private boolean needInitCache = true;
|
|
||||||
|
|
||||||
public Html(List<String> strings) {
|
|
||||||
super(strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Html(String text) {
|
public Html(String text) {
|
||||||
super(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Html(List<String> strings, boolean needInitCache) {
|
|
||||||
super(strings);
|
|
||||||
this.needInitCache = needInitCache;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Html(String text, boolean needInitCache) {
|
|
||||||
super(text);
|
|
||||||
this.needInitCache = needInitCache;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* lazy init
|
|
||||||
*/
|
|
||||||
private void initDocument() {
|
|
||||||
if (this.document == null && needInitCache) {
|
|
||||||
needInitCache = false;
|
|
||||||
//just init once whether the parsing succeeds or not
|
|
||||||
try {
|
try {
|
||||||
this.document = Jsoup.parse(getText());
|
this.document = Jsoup.parse(text);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
this.document = null;
|
||||||
logger.warn("parse document error ", e);
|
logger.warn("parse document error ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
public Html(Document document) {
|
public Html(Document document) {
|
||||||
super(document.html());
|
|
||||||
this.document = document;
|
this.document = document;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Html create(String text) {
|
|
||||||
return new Html(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Selectable select(Selector selector, List<String> strings) {
|
|
||||||
initDocument();
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
|
||||||
String result = selector.select(string);
|
|
||||||
if (result != null) {
|
|
||||||
results.add(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new Html(results, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
|
||||||
initDocument();
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
|
||||||
List<String> result = selector.selectList(string);
|
|
||||||
results.addAll(result);
|
|
||||||
}
|
|
||||||
return new Html(results, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable smartContent() {
|
|
||||||
initDocument();
|
|
||||||
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
|
||||||
return select(smartContentSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable links() {
|
|
||||||
return xpath("//a/@href");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable xpath(String xpath) {
|
|
||||||
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
|
||||||
if (document != null) {
|
|
||||||
return new Html(xpathSelector.selectList(document), false);
|
|
||||||
}
|
|
||||||
return selectList(xpathSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable $(String selector) {
|
|
||||||
CssSelector cssSelector = Selectors.$(selector);
|
|
||||||
if (document != null) {
|
|
||||||
return new Html(cssSelector.selectList(document), false);
|
|
||||||
}
|
|
||||||
return selectList(cssSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable $(String selector, String attrName) {
|
|
||||||
CssSelector cssSelector = Selectors.$(selector, attrName);
|
|
||||||
if (document != null) {
|
|
||||||
return new Html(cssSelector.selectList(document), false);
|
|
||||||
}
|
|
||||||
return selectList(cssSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Document getDocument() {
|
public Document getDocument() {
|
||||||
initDocument();
|
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getText() {
|
@Override
|
||||||
if (strings != null && strings.size() > 0) {
|
protected List<Element> getElements() {
|
||||||
return strings.get(0);
|
return Collections.<Element>singletonList(getDocument());
|
||||||
}
|
|
||||||
return document.html();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -151,7 +55,7 @@ public class Html extends PlainText {
|
||||||
ElementSelector elementSelector = (ElementSelector) selector;
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
return elementSelector.select(getDocument());
|
return elementSelector.select(getDocument());
|
||||||
} else {
|
} else {
|
||||||
return selector.select(getText());
|
return selector.select(getFirstSourceText());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,7 +64,12 @@ public class Html extends PlainText {
|
||||||
ElementSelector elementSelector = (ElementSelector) selector;
|
ElementSelector elementSelector = (ElementSelector) selector;
|
||||||
return elementSelector.selectList(getDocument());
|
return elementSelector.selectList(getDocument());
|
||||||
} else {
|
} else {
|
||||||
return selector.selectList(getText());
|
return selector.selectList(getFirstSourceText());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Html create(String text) {
|
||||||
|
return new Html(text);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,125 @@
|
||||||
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class HtmlNode extends AbstractSelectable {
|
||||||
|
|
||||||
|
private final List<Element> elements;
|
||||||
|
|
||||||
|
public HtmlNode(List<Element> elements) {
|
||||||
|
this.elements = elements;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HtmlNode() {
|
||||||
|
elements = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<Element> getElements() {
|
||||||
|
return elements;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable smartContent() {
|
||||||
|
SmartContentSelector smartContentSelector = Selectors.smartContent();
|
||||||
|
return select(smartContentSelector, getSourceTexts());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable links() {
|
||||||
|
return xpath("//a/@href");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable xpath(String xpath) {
|
||||||
|
XpathSelector xpathSelector = Selectors.xpath(xpath);
|
||||||
|
return selectElements(xpathSelector);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* select elements
|
||||||
|
*
|
||||||
|
* @param elementSelector
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected Selectable selectElements(BaseElementSelector elementSelector) {
|
||||||
|
ListIterator<Element> elementIterator = getElements().listIterator();
|
||||||
|
if (!elementSelector.hasAttribute()) {
|
||||||
|
List<Element> resultElements = new ArrayList<Element>();
|
||||||
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
|
List<Element> selectElements = elementSelector.selectElements(element);
|
||||||
|
resultElements.addAll(selectElements);
|
||||||
|
}
|
||||||
|
return new HtmlNode(resultElements);
|
||||||
|
} else {
|
||||||
|
// has attribute, consider as plaintext
|
||||||
|
List<String> resultStrings = new ArrayList<String>();
|
||||||
|
while (elementIterator.hasNext()) {
|
||||||
|
Element element = checkElementAndConvert(elementIterator);
|
||||||
|
List<String> selectList = elementSelector.selectList(element);
|
||||||
|
resultStrings.addAll(selectList);
|
||||||
|
}
|
||||||
|
return new PlainText(resultStrings);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Only document can be select
|
||||||
|
* See: https://github.com/code4craft/webmagic/issues/113
|
||||||
|
*
|
||||||
|
* @param elementIterator
|
||||||
|
* @param element
|
||||||
|
*/
|
||||||
|
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
|
||||||
|
Element element = elementIterator.next();
|
||||||
|
if (!(element instanceof Document)) {
|
||||||
|
Document root = new Document(element.ownerDocument().baseUri());
|
||||||
|
Element clone = element.clone();
|
||||||
|
root.appendChild(clone);
|
||||||
|
elementIterator.set(root);
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
return element;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable $(String selector) {
|
||||||
|
CssSelector cssSelector = Selectors.$(selector);
|
||||||
|
return selectElements(cssSelector);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Selectable $(String selector, String attrName) {
|
||||||
|
CssSelector cssSelector = Selectors.$(selector, attrName);
|
||||||
|
return selectElements(cssSelector);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Selectable> nodes() {
|
||||||
|
List<Selectable> selectables = new ArrayList<Selectable>();
|
||||||
|
for (Element element : getElements()) {
|
||||||
|
List<Element> childElements = new ArrayList<Element>(1);
|
||||||
|
childElements.add(element);
|
||||||
|
selectables.add(new HtmlNode(childElements));
|
||||||
|
}
|
||||||
|
return selectables;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<String> getSourceTexts() {
|
||||||
|
List<String> sourceTexts = new ArrayList<String>(getElements().size());
|
||||||
|
for (Element element : getElements()) {
|
||||||
|
sourceTexts.add(element.toString());
|
||||||
|
}
|
||||||
|
return sourceTexts;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,7 +1,7 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import com.alibaba.fastjson.JSON;
|
import com.alibaba.fastjson.JSON;
|
||||||
import org.jsoup.parser.TokenQueue;
|
import us.codecraft.xsoup.XTokenQueue;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -26,39 +26,32 @@ public class Json extends PlainText {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Json removePadding(String padding) {
|
public Json removePadding(String padding) {
|
||||||
String text = getText();
|
String text = getFirstSourceText();
|
||||||
TokenQueue tokenQueue = new TokenQueue(text);
|
XTokenQueue tokenQueue = new XTokenQueue(text);
|
||||||
tokenQueue.consumeWhitespace();
|
tokenQueue.consumeWhitespace();
|
||||||
tokenQueue.consume(padding);
|
tokenQueue.consume(padding);
|
||||||
tokenQueue.consumeWhitespace();
|
tokenQueue.consumeWhitespace();
|
||||||
String chompBalanced = tokenQueue.chompBalanced('(', ')');
|
String chompBalanced = tokenQueue.chompBalancedNotInQuotes('(', ')');
|
||||||
return new Json(chompBalanced);
|
return new Json(chompBalanced);
|
||||||
}
|
}
|
||||||
|
|
||||||
public <T> T toObject(Class<T> clazz) {
|
public <T> T toObject(Class<T> clazz) {
|
||||||
if (getText() == null) {
|
if (getFirstSourceText() == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return JSON.parseObject(getText(), clazz);
|
return JSON.parseObject(getFirstSourceText(), clazz);
|
||||||
}
|
}
|
||||||
|
|
||||||
public <T> List<T> toList(Class<T> clazz) {
|
public <T> List<T> toList(Class<T> clazz) {
|
||||||
if (getText() == null) {
|
if (getFirstSourceText() == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return JSON.parseArray(getText(), clazz);
|
return JSON.parseArray(getFirstSourceText(), clazz);
|
||||||
}
|
|
||||||
|
|
||||||
public String getText() {
|
|
||||||
if (strings != null && strings.size() > 0) {
|
|
||||||
return strings.get(0);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable jsonPath(String jsonPath) {
|
public Selectable jsonPath(String jsonPath) {
|
||||||
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
|
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
|
||||||
return selectList(jsonPathSelector,strings);
|
return selectList(jsonPathSelector,getSourceTexts());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,9 +46,12 @@ public class JsonPathSelector implements Selector {
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
if (object instanceof List) {
|
if (object instanceof List) {
|
||||||
return (List<String>) object;
|
List<Object> items = (List<Object>) object;
|
||||||
|
for (Object item : items) {
|
||||||
|
list.add(String.valueOf(item));
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
list.add(object.toString());
|
list.add(String.valueOf(object));
|
||||||
}
|
}
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -12,18 +10,17 @@ import java.util.List;
|
||||||
* @author code4crafter@gmail.com <br>
|
* @author code4crafter@gmail.com <br>
|
||||||
* @since 0.1.0
|
* @since 0.1.0
|
||||||
*/
|
*/
|
||||||
public class PlainText implements Selectable {
|
public class PlainText extends AbstractSelectable {
|
||||||
|
|
||||||
protected List<String> strings;
|
protected List<String> sourceTexts;
|
||||||
|
|
||||||
public PlainText(List<String> strings) {
|
public PlainText(List<String> sourceTexts) {
|
||||||
this.strings = strings;
|
this.sourceTexts = sourceTexts;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PlainText(String text) {
|
public PlainText(String text) {
|
||||||
List<String> results = new ArrayList<String>();
|
this.sourceTexts = new ArrayList<String>();
|
||||||
results.add(text);
|
sourceTexts.add(text);
|
||||||
this.strings = results;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static PlainText create(String text) {
|
public static PlainText create(String text) {
|
||||||
|
@ -45,16 +42,6 @@ public class PlainText implements Selectable {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable css(String selector) {
|
|
||||||
return $(selector);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable css(String selector, String attrName) {
|
|
||||||
return $(selector, attrName);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable smartContent() {
|
public Selectable smartContent() {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
@ -66,79 +53,16 @@ public class PlainText implements Selectable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable regex(String regex) {
|
public List<Selectable> nodes() {
|
||||||
RegexSelector regexSelector = Selectors.regex(regex);
|
List<Selectable> nodes = new ArrayList<Selectable>(getSourceTexts().size());
|
||||||
return selectList(regexSelector, strings);
|
for (String string : getSourceTexts()) {
|
||||||
|
nodes.add(PlainText.create(string));
|
||||||
|
}
|
||||||
|
return nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Selectable regex(String regex, int group) {
|
protected List<String> getSourceTexts() {
|
||||||
RegexSelector regexSelector = Selectors.regex(regex, group);
|
return sourceTexts;
|
||||||
return selectList(regexSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Selectable select(Selector selector, List<String> strings) {
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
|
||||||
String result = selector.select(string);
|
|
||||||
if (result != null) {
|
|
||||||
results.add(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new PlainText(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Selectable selectList(Selector selector, List<String> strings) {
|
|
||||||
List<String> results = new ArrayList<String>();
|
|
||||||
for (String string : strings) {
|
|
||||||
List<String> result = selector.selectList(string);
|
|
||||||
results.addAll(result);
|
|
||||||
}
|
|
||||||
return new PlainText(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable replace(String regex, String replacement) {
|
|
||||||
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
|
|
||||||
return select(replaceSelector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<String> all() {
|
|
||||||
return strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable jsonPath(String jsonPath) {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String get() {
|
|
||||||
if (CollectionUtils.isNotEmpty(all())) {
|
|
||||||
return all().get(0);
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable select(Selector selector) {
|
|
||||||
return select(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Selectable selectList(Selector selector) {
|
|
||||||
return selectList(selector, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return get();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean match() {
|
|
||||||
return strings != null && strings.size() > 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -143,4 +143,10 @@ public interface Selectable {
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Selectable selectList(Selector selector);
|
public Selectable selectList(Selector selector);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get all nodes
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<Selectable> nodes();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
import us.codecraft.xsoup.XPathEvaluator;
|
import us.codecraft.xsoup.XPathEvaluator;
|
||||||
import us.codecraft.xsoup.Xsoup;
|
import us.codecraft.xsoup.Xsoup;
|
||||||
|
@ -29,4 +30,23 @@ public class XpathSelector extends BaseElementSelector {
|
||||||
public List<String> selectList(Element element) {
|
public List<String> selectList(Element element) {
|
||||||
return xPathEvaluator.evaluate(element).list();
|
return xPathEvaluator.evaluate(element).list();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Element selectElement(Element element) {
|
||||||
|
List<Element> elements = selectElements(element);
|
||||||
|
if (CollectionUtils.isNotEmpty(elements)){
|
||||||
|
return elements.get(0);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Element> selectElements(Element element) {
|
||||||
|
return xPathEvaluator.evaluate(element).getElements();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasAttribute() {
|
||||||
|
return xPathEvaluator.hasAttribute();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
|
import com.github.dreamhead.moco.*;
|
||||||
|
import com.github.dreamhead.moco.Runnable;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Page;
|
import us.codecraft.webmagic.Page;
|
||||||
|
@ -8,9 +13,12 @@ import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.selector.Html;
|
import us.codecraft.webmagic.selector.Html;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
import static com.github.dreamhead.moco.Moco.*;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -31,7 +39,7 @@ public class HttpClientDownloaderTest {
|
||||||
public void testDownloader() {
|
public void testDownloader() {
|
||||||
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
|
||||||
Html html = httpClientDownloader.download("https://github.com");
|
Html html = httpClientDownloader.download("https://github.com");
|
||||||
assertTrue(!html.getText().isEmpty());
|
assertTrue(!html.getFirstSourceText().isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = IllegalArgumentException.class)
|
@Test(expected = IllegalArgumentException.class)
|
||||||
|
@ -52,4 +60,54 @@ public class HttpClientDownloaderTest {
|
||||||
assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
|
assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetHtmlCharset() throws Exception {
|
||||||
|
HttpServer server = httpserver(12306);
|
||||||
|
server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
|
||||||
|
server.get(by(uri("/meta4"))).response(with(text("<html>\n" +
|
||||||
|
" <head>\n" +
|
||||||
|
" <meta charset='gbk'/>\n" +
|
||||||
|
" </head>\n" +
|
||||||
|
" <body></body>\n" +
|
||||||
|
"</html>")),header("Content-Type",""));
|
||||||
|
server.get(by(uri("/meta5"))).response(with(text("<html>\n" +
|
||||||
|
" <head>\n" +
|
||||||
|
" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" +
|
||||||
|
" </head>\n" +
|
||||||
|
" <body></body>\n" +
|
||||||
|
"</html>")),header("Content-Type",""));
|
||||||
|
Runner.running(server, new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
String charset = getCharsetByUrl("http://127.0.0.1:12306/header");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
charset = getCharsetByUrl("http://127.0.0.1:12306/meta4");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
charset = getCharsetByUrl("http://127.0.0.1:12306/meta5");
|
||||||
|
assertEquals(charset, "gbk");
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getCharsetByUrl(String url) {
|
||||||
|
HttpClientDownloader downloader = new HttpClientDownloader();
|
||||||
|
Site site = Site.me();
|
||||||
|
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
|
||||||
|
// encoding in http header Content-Type
|
||||||
|
Request requestGBK = new Request(url);
|
||||||
|
CloseableHttpResponse httpResponse = null;
|
||||||
|
try {
|
||||||
|
httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
String charset = null;
|
||||||
|
try {
|
||||||
|
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
|
||||||
|
charset = downloader.getHtmlCharset(httpResponse,contentBytes);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
package us.codecraft.webmagic.pipeline;
|
||||||
|
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
import us.codecraft.webmagic.Request;
|
||||||
|
import us.codecraft.webmagic.ResultItems;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by ywooer on 2014/5/6 0006.
|
||||||
|
*/
|
||||||
|
public class FilePipelineTest {
|
||||||
|
|
||||||
|
private static ResultItems resultItems;
|
||||||
|
private static Task task;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void before() {
|
||||||
|
resultItems = new ResultItems();
|
||||||
|
resultItems.put("content", "webmagic 爬虫工具");
|
||||||
|
Request request = new Request("http://www.baidu.com");
|
||||||
|
resultItems.setRequest(request);
|
||||||
|
|
||||||
|
task = new Task() {
|
||||||
|
@Override
|
||||||
|
public String getUUID() {
|
||||||
|
return UUID.randomUUID().toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testProcess() {
|
||||||
|
FilePipeline filePipeline = new FilePipeline();
|
||||||
|
filePipeline.process(resultItems, task);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,8 +1,8 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import static us.codecraft.webmagic.selector.Selectors.*;
|
import static us.codecraft.webmagic.selector.Selectors.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -16,19 +16,19 @@ public class ExtractorsTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEach() {
|
public void testEach() {
|
||||||
Assert.assertEquals("<a href=\"xxx\">aabbcc</a>", $("div h1 a").select(html));
|
assertThat($("div h1 a").select(html)).isEqualTo("<a href=\"xxx\">aabbcc</a>");
|
||||||
Assert.assertEquals("xxx", $("div h1 a", "href").select(html));
|
assertThat($("div h1 a", "href").select(html)).isEqualTo("xxx");
|
||||||
Assert.assertEquals("aabbcc", $("div h1 a", "innerHtml").select(html));
|
assertThat($("div h1 a", "innerHtml").select(html)).isEqualTo("aabbcc");
|
||||||
Assert.assertEquals("xxx", xpath("//a/@href").select(html));
|
assertThat(xpath("//a/@href").select(html)).isEqualTo("xxx");
|
||||||
Assert.assertEquals("xxx", regex("a href=\"(.*)\"").select(html));
|
assertThat(regex("a href=\"(.*)\"").select(html)).isEqualTo("xxx");
|
||||||
Assert.assertEquals("xxx", regex("(a href)=\"(.*)\"", 2).select(html));
|
assertThat(regex("(a href)=\"(.*)\"", 2).select(html)).isEqualTo("xxx");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCombo() {
|
public void testCombo() {
|
||||||
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
|
assertThat(and($("title"), regex("aa(bb)cc")).select(html2)).isEqualTo("bb");
|
||||||
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
|
||||||
Assert.assertEquals("aabbcc", or.select(html));
|
assertThat(or.select(html)).isEqualTo("aabbcc");
|
||||||
Assert.assertEquals("<title>aabbcc</title>", or.select(html2));
|
assertThat(or.select(html2)).isEqualTo("<title>aabbcc</title>");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmai.com <br>
|
* @author code4crafter@gmai.com <br>
|
||||||
*/
|
*/
|
||||||
|
@ -32,16 +33,16 @@ public class JsonPathSelectorTest {
|
||||||
"}";
|
"}";
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void testJsonPath() {
|
||||||
JsonPathSelector jsonPathSelector = new JsonPathSelector("$.store.book[*].author");
|
JsonPathSelector jsonPathSelector = new JsonPathSelector("$.store.book[*].author");
|
||||||
String select = jsonPathSelector.select(text);
|
String select = jsonPathSelector.select(text);
|
||||||
List<String> list = jsonPathSelector.selectList(text);
|
List<String> list = jsonPathSelector.selectList(text);
|
||||||
Assert.assertNotNull(select);
|
assertThat(select).isEqualTo("Nigel Rees");
|
||||||
Assert.assertNotNull(list);
|
assertThat(list).contains("Nigel Rees","Evelyn Waugh");
|
||||||
jsonPathSelector = new JsonPathSelector("$.store.book[?(@.category == 'reference')]");
|
jsonPathSelector = new JsonPathSelector("$.store.book[?(@.category == 'reference')]");
|
||||||
list = jsonPathSelector.selectList(text);
|
list = jsonPathSelector.selectList(text);
|
||||||
select = jsonPathSelector.select(text);
|
select = jsonPathSelector.select(text);
|
||||||
Assert.assertNotNull(list);
|
assertThat(select).isEqualTo("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}");
|
||||||
Assert.assertNotNull(select);
|
assertThat(list).contains("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,9 +12,17 @@ public class JsonTest {
|
||||||
|
|
||||||
private String text = "callback({\"name\":\"json\"})";
|
private String text = "callback({\"name\":\"json\"})";
|
||||||
|
|
||||||
|
private String textWithBrackerInContent = "callback({\"name\":\"json)\"})";
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRemovePadding() throws Exception {
|
public void testRemovePadding() throws Exception {
|
||||||
String name = new Json(text).removePadding("callback").jsonPath("$.name").get();
|
String name = new Json(text).removePadding("callback").jsonPath("$.name").get();
|
||||||
assertThat(name).isEqualTo("json");
|
assertThat(name).isEqualTo("json");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRemovePaddingForQuotes() throws Exception {
|
||||||
|
String name = new Json(textWithBrackerInContent).removePadding("callback").jsonPath("$.name").get();
|
||||||
|
assertThat(name).isEqualTo("json)");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
package us.codecraft.webmagic.selector;
|
package us.codecraft.webmagic.selector;
|
||||||
|
|
||||||
import org.junit.Assert;
|
import org.assertj.core.api.Assertions;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -20,6 +20,6 @@ public class RegexSelectorTest {
|
||||||
String source = "(hello world";
|
String source = "(hello world";
|
||||||
RegexSelector regexSelector = new RegexSelector(regex);
|
RegexSelector regexSelector = new RegexSelector(regex);
|
||||||
String select = regexSelector.select(source);
|
String select = regexSelector.select(source);
|
||||||
Assert.assertEquals(source,select);
|
Assertions.assertThat(select).isEqualTo(source);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,4 +23,11 @@ public class SelectorTest {
|
||||||
assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall);
|
assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall);
|
||||||
assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall);
|
assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNodes() throws Exception {
|
||||||
|
Html selectable = new Html(html);
|
||||||
|
List<Selectable> links = selectable.xpath("//a").nodes();
|
||||||
|
assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,11 +49,6 @@ public class UrlUtilsTest {
|
||||||
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
assertThat(replacedHtml).isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test(){
|
|
||||||
UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetDomain(){
|
public void testGetDomain(){
|
||||||
String url = "http://www.dianping.com/aa/";
|
String url = "http://www.dianping.com/aa/";
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -78,4 +78,17 @@ public class GithubRepo implements HasKey {
|
||||||
public int getFork() {
|
public int getFork() {
|
||||||
return fork;
|
return fork;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "GithubRepo{" +
|
||||||
|
"name='" + name + '\'' +
|
||||||
|
", author='" + author + '\'' +
|
||||||
|
", readme='" + readme + '\'' +
|
||||||
|
", language=" + language +
|
||||||
|
", star=" + star +
|
||||||
|
", fork=" + fork +
|
||||||
|
", url='" + url + '\'' +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
package us.codecraft.webmagic.example;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.model.PageMapper;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com <br>
|
||||||
|
* @since 0.3.2
|
||||||
|
*/
|
||||||
|
public class GithubRepoPageMapper implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
|
||||||
|
|
||||||
|
private PageMapper<GithubRepo> githubRepoPageMapper = new PageMapper<GithubRepo>(GithubRepo.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
||||||
|
GithubRepo githubRepo = githubRepoPageMapper.get(page);
|
||||||
|
if (githubRepo == null) {
|
||||||
|
page.setSkip(true);
|
||||||
|
} else {
|
||||||
|
page.putField("repo", githubRepo);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,29 @@
|
||||||
|
package us.codecraft.webmagic.model;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
* @since 0.5.2
|
||||||
|
*/
|
||||||
|
public class PageMapper<T> {
|
||||||
|
|
||||||
|
private Class<T> clazz;
|
||||||
|
|
||||||
|
private PageModelExtractor pageModelExtractor;
|
||||||
|
|
||||||
|
public PageMapper(Class<T> clazz) {
|
||||||
|
this.clazz = clazz;
|
||||||
|
this.pageModelExtractor = PageModelExtractor.create(clazz);
|
||||||
|
}
|
||||||
|
|
||||||
|
public T get(Page page) {
|
||||||
|
return (T) pageModelExtractor.process(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<T> getAll(Page page) {
|
||||||
|
return (List<T>) pageModelExtractor.process(page);
|
||||||
|
}
|
||||||
|
}
|
|
@ -8,7 +8,6 @@ import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.utils.FilePersistentBase;
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
@ -38,7 +37,7 @@ public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
|
||||||
public void process(ResultItems resultItems, Task task) {
|
public void process(ResultItems resultItems, Task task) {
|
||||||
String path = this.path + "/" + task.getUUID() + "/";
|
String path = this.path + "/" + task.getUUID() + "/";
|
||||||
try {
|
try {
|
||||||
PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
|
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
|
||||||
printWriter.write(JSON.toJSONString(resultItems.getAll()));
|
printWriter.write(JSON.toJSONString(resultItems.getAll()));
|
||||||
printWriter.close();
|
printWriter.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
package us.codecraft.webmagic.samples;
|
||||||
|
|
||||||
|
import us.codecraft.webmagic.Page;
|
||||||
|
import us.codecraft.webmagic.Site;
|
||||||
|
import us.codecraft.webmagic.Spider;
|
||||||
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.samples.pipeline.OneFilePipeline;
|
||||||
|
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||||
|
import us.codecraft.webmagic.selector.Selectable;
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class MamacnPageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void process(Page page) {
|
||||||
|
List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
|
||||||
|
StringBuilder accum = new StringBuilder();
|
||||||
|
for (Selectable node : nodes) {
|
||||||
|
accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
|
||||||
|
accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
|
||||||
|
}
|
||||||
|
page.putField("",accum.toString());
|
||||||
|
if (accum.length() == 0) {
|
||||||
|
page.setSkip(true);
|
||||||
|
}
|
||||||
|
page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Site getSite() {
|
||||||
|
return site;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
Spider.create(new MamacnPageProcessor())
|
||||||
|
.setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
|
||||||
|
.addUrl("http://www.mama.cn/photo/t1-p1.html")
|
||||||
|
.addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
|
||||||
|
.thread(5)
|
||||||
|
.run();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
package us.codecraft.webmagic.samples.pipeline;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import us.codecraft.webmagic.ResultItems;
|
||||||
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.utils.FilePersistentBase;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafer@gmail.com
|
||||||
|
*/
|
||||||
|
public class OneFilePipeline extends FilePersistentBase implements Pipeline {
|
||||||
|
|
||||||
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private PrintWriter printWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a FilePipeline with default path"/data/webmagic/"
|
||||||
|
*/
|
||||||
|
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
this("/data/webmagic/");
|
||||||
|
}
|
||||||
|
|
||||||
|
public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
|
setPath(path);
|
||||||
|
printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void process(ResultItems resultItems, Task task) {
|
||||||
|
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
|
||||||
|
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
|
||||||
|
if (entry.getValue() instanceof Iterable) {
|
||||||
|
Iterable value = (Iterable) entry.getValue();
|
||||||
|
printWriter.println(entry.getKey() + ":");
|
||||||
|
for (Object o : value) {
|
||||||
|
printWriter.println(o);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printWriter.println(entry.getKey() + ":\t" + entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printWriter.flush();
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<artifactId>webmagic-parent</artifactId>
|
<artifactId>webmagic-parent</artifactId>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
|
|
@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-extension</artifactId>
|
<artifactId>webmagic-extension</artifactId>
|
||||||
<version>0.5.1</version>
|
<version>0.5.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue