diff --git a/.gitignore b/.gitignore
index d7d63fe..0175dba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,6 @@ out/
.idea
.classpath
.project
+.settings/
+bin/
+.myeclipse
diff --git a/README.md b/README.md
index b23bf83..deb17d4 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
us.codecraft
webmagic-core
- 0.5.1
+ 0.5.2
us.codecraft
webmagic-extension
- 0.5.1
+ 0.5.2
```
diff --git a/pom.xml b/pom.xml
index c17e1a3..d5a107e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
7
us.codecraft
- 0.5.1
+ 0.5.2
4.0.0
pom
@@ -88,13 +88,25 @@
us.codecraft
xsoup
- 0.2.2
+ 0.2.4
com.alibaba
fastjson
1.1.37
+
+ com.github.dreamhead
+ moco-core
+ 0.9.1
+ test
+
+
+ org.slf4j
+ slf4j-simple
+
+
+
log4j
log4j
@@ -230,22 +242,44 @@
- release-sign-artifacts
-
-
- performRelease
- true
-
-
+ release
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 2.2.1
+
+
+ package
+
+ jar-no-fork
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+ package
+
+ jar
+
+
+
+
+
org.apache.maven.plugins
maven-gpg-plugin
- 1.1
+ 1.5
- sign-artifacts
verify
sign
@@ -253,10 +287,29 @@
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6
+ true
+
+ sonatype-nexus-staging
+ https://oss.sonatype.org/
+ true
+
+
+
+
+ sonatype-nexus-snapshots
+ https://oss.sonatype.org/content/repositories/snapshots/
+
+
+ sonatype-nexus-staging
+ https://oss.sonatype.org/service/local/staging/deploy/maven2/
+
+
-
-
diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml
index 0dbb369..3653649 100644
--- a/webmagic-avalon/pom.xml
+++ b/webmagic-avalon/pom.xml
@@ -3,7 +3,7 @@
webmagic-parent
us.codecraft
- 0.5.1-SNAPSHOT
+ 0.5.2-SNAPSHOT-SNAPSHOT
4.0.0
diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml
index ed364c1..ec73349 100644
--- a/webmagic-avalon/webmagic-admin/pom.xml
+++ b/webmagic-avalon/webmagic-admin/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.1-SNAPSHOT
+ 0.5.2-SNAPSHOT-SNAPSHOT
4.0.0
diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml
index 9c7199a..cec0b83 100644
--- a/webmagic-avalon/webmagic-avalon-common/pom.xml
+++ b/webmagic-avalon/webmagic-avalon-common/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.1-SNAPSHOT
+ 0.5.2-SNAPSHOT-SNAPSHOT
4.0.0
diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml
index ebc5174..1d5df01 100644
--- a/webmagic-avalon/webmagic-worker/pom.xml
+++ b/webmagic-avalon/webmagic-worker/pom.xml
@@ -3,7 +3,7 @@
webmagic-avalon
us.codecraft
- 0.5.1-SNAPSHOT
+ 0.5.2-SNAPSHOT-SNAPSHOT
4.0.0
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 6973109..85e9c4a 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -3,7 +3,7 @@
us.codecraft
webmagic-parent
- 0.5.1
+ 0.5.2
4.0.0
@@ -35,6 +35,11 @@
xsoup
+
+ com.github.dreamhead
+ moco-core
+
+
org.slf4j
slf4j-api
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index 1f8a194..9a0321e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -18,6 +18,8 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
+ public static final String STATUS_CODE = "statusCode";
+ public static final String PROXY = "proxy";
private String url;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index a7c7bf8..01a4c75 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -3,6 +3,8 @@ package us.codecraft.webmagic;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost;
+
+import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
@@ -47,6 +49,8 @@ public class Site {
private HttpHost httpProxy;
+ private ProxyPool httpProxyPool=new ProxyPool();
+
private boolean useGzip = true;
/**
@@ -438,4 +442,32 @@ public class Site {
", headers=" + headers +
'}';
}
+
+ /**
+ * Set httpProxyPool, String[0]:ip, String[1]:port
+ * <p>
+ * Replaces the current pool with a new ProxyPool built from the given list.
+ *
+ * @param httpProxyList proxy entries; each entry is a String[] holding {ip, port}
+ * @return this
+ */
+ public Site setHttpProxyPool(List httpProxyList) {
+ this.httpProxyPool=new ProxyPool(httpProxyList);
+ return this;
+ }
+
+ /**
+ * @return the proxy pool currently attached to this site
+ */
+ public ProxyPool getHttpProxyPool() {
+ return httpProxyPool;
+ }
+
+ /**
+ * Borrows a proxy from the pool by delegating to ProxyPool.getProxy().
+ *
+ * @return the borrowed proxy host
+ */
+ public HttpHost getHttpProxyFromPool() {
+ return httpProxyPool.getProxy();
+ }
+
+ /**
+ * Hands a borrowed proxy back to the pool.
+ *
+ * @param proxy the proxy previously obtained from getHttpProxyFromPool()
+ * @param statusCode status code of the request the proxy served; forwarded to ProxyPool.returnProxy
+ */
+ public void returnHttpProxyToPool(HttpHost proxy,int statusCode) {
+ httpProxyPool.returnProxy(proxy,statusCode);
+ }
+
+ /**
+ * Sets the reuse interval on the current pool instance.
+ * NOTE(review): setHttpProxyPool creates a new pool, so an interval set before that call is not carried over.
+ *
+ * @param reuseInterval interval forwarded to ProxyPool.setReuseInterval
+ * @return this
+ */
+ public Site setProxyReuseInterval(int reuseInterval) {
+ this.httpProxyPool.setReuseInterval(reuseInterval);
+ return this;
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 81cf179..6f6453b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
+import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
@@ -324,6 +325,10 @@ public class Spider implements Runnable, Task {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
} finally {
+ if (site.getHttpProxyPool().isEnable()) {
+ site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal
+ .getExtra(Request.STATUS_CODE));
+ }
pageCount.incrementAndGet();
signalNewUrl();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index eeae70e..bdafea7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -2,6 +2,8 @@ package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
@@ -12,17 +14,22 @@ import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@@ -79,16 +86,13 @@ public class HttpClientDownloader extends AbstractDownloader {
}
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
+ int statusCode=0;
try {
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
httpResponse = getHttpClient(site).execute(httpUriRequest);
- int statusCode = httpResponse.getStatusLine().getStatusCode();
+ statusCode = httpResponse.getStatusLine().getStatusCode();
+ request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
- //charset
- if (charset == null) {
- String value = httpResponse.getEntity().getContentType().getValue();
- charset = UrlUtils.getCharset(value);
- }
Page page = handleResponse(request, charset, httpResponse, task);
onSuccess(request);
return page;
@@ -104,6 +108,7 @@ public class HttpClientDownloader extends AbstractDownloader {
onError(request);
return null;
} finally {
+ request.putExtra(Request.STATUS_CODE, statusCode);
try {
if (httpResponse != null) {
//ensure the connection is released back to pool
@@ -136,9 +141,11 @@ public class HttpClientDownloader extends AbstractDownloader {
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
- if (site != null && site.getHttpProxy() != null) {
- requestConfigBuilder.setProxy(site.getHttpProxy());
- }
+ if (site.getHttpProxyPool().isEnable()) {
+ HttpHost host = site.getHttpProxyFromPool();
+ requestConfigBuilder.setProxy(host);
+ request.putExtra(Request.PROXY, host);
+ }
requestBuilder.setConfig(requestConfigBuilder.build());
return requestBuilder.build();
}
@@ -168,7 +175,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
- String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
+ String content = getContent(charset, httpResponse);
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
@@ -176,4 +183,57 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
return page;
}
+
+    /**
+     * Reads the response body as text.
+     * <p>
+     * With an explicit charset the stream is decoded directly. Without one the body is
+     * buffered, the charset is looked up via getHtmlCharset, and the platform default
+     * charset is used (with a warning) when that lookup yields nothing.
+     *
+     * @param charset      charset to decode with, or null to auto-detect
+     * @param httpResponse response whose entity is read
+     * @return the decoded body
+     * @throws IOException if reading or decoding the body fails
+     */
+    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
+        // Caller supplied a charset: stream straight into a String.
+        if (charset != null) {
+            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
+        }
+        // Otherwise keep the raw bytes so they can be inspected before decoding.
+        byte[] rawBody = IOUtils.toByteArray(httpResponse.getEntity().getContent());
+        String detected = getHtmlCharset(httpResponse, rawBody);
+        if (detected == null) {
+            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
+            return new String(rawBody);
+        }
+        return new String(rawBody, detected);
+    }
+
+    /**
+     * Tries to determine the charset of an HTML response.
+     * <ol>
+     * <li>the charset parameter of the HTTP Content-Type header;</li>
+     * <li>a {@code <meta>} tag: either a {@code content} attribute containing
+     * {@code charset=...} (HTML 4.01) or a {@code charset} attribute (HTML5).</li>
+     * </ol>
+     *
+     * @param httpResponse response whose Content-Type header is inspected
+     * @param contentBytes raw body bytes, decoded with the platform default charset for the meta scan
+     * @return the detected charset name, or null when nothing usable was found
+     * @throws IOException if the body cannot be decoded for inspection
+     */
+    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
+        String charset = null;
+        // charset
+        // 1. encoding in http header Content-Type (the header is optional, so guard against null)
+        if (httpResponse.getEntity().getContentType() != null) {
+            String value = httpResponse.getEntity().getContentType().getValue();
+            charset = UrlUtils.getCharset(value);
+        }
+        if (StringUtils.isNotBlank(charset)) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // A blank header value must not leak out as a "detected" charset.
+        charset = null;
+        // use default charset to decode first time
+        Charset defaultCharset = Charset.defaultCharset();
+        String content = new String(contentBytes, defaultCharset.name());
+        // 2. charset in meta
+        if (StringUtils.isNotEmpty(content)) {
+            Document document = Jsoup.parse(content);
+            Elements metas = document.select("meta");
+            for (Element meta : metas) {
+                // 2.1 html4.01: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+                String metaContent = meta.attr("content");
+                int charsetIndex = metaContent.indexOf("charset");
+                if (charsetIndex != -1) {
+                    String[] keyValue = metaContent.substring(charsetIndex).split("=");
+                    // "charset" without "=value" would otherwise throw ArrayIndexOutOfBoundsException
+                    if (keyValue.length > 1 && StringUtils.isNotBlank(keyValue[1])) {
+                        charset = keyValue[1].trim();
+                        break;
+                    }
+                }
+                // 2.2 html5: <meta charset="UTF-8" />
+                String metaCharset = meta.attr("charset");
+                if (StringUtils.isNotBlank(metaCharset)) {
+                    charset = metaCharset.trim();
+                    break;
+                }
+            }
+        }
+        logger.debug("Auto get charset: {}", charset);
+        // 3. todo use tools as cpdetector for content decode
+        return charset;
+    }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
index 8eab426..57d6eea 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -4,12 +4,14 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
@@ -39,7 +41,7 @@ public class FilePipeline extends FilePersistentBase implements Pipeline {
public void process(ResultItems resultItems, Task task) {
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
- PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
+ PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
for (Map.Entry entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
new file mode 100644
index 0000000..27e6b52
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -0,0 +1,172 @@
+package us.codecraft.webmagic.proxy;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Delayed;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.http.HttpHost;
+
+/**
+ * >>>>Proxy Status
+ +----------+ +-----+
+ | last use | | new |
+ +-----+----+ +---+-+
+ | +------+ |
+ +->| init |<--+
+ +--+---+
+ |
+ v
+ +--------+
+ +--->| borrow |
+ | +---+----+
+ | |+------------------+
+ | v
+ | +--------+
+ | | in use | Response Time
+ | +---+----+
+ | |+------------------+
+ | v
+ | +--------+
+ | | return |
+ | +---+----+
+ | |+-------------------+
+ | v
+ | +-------+ reuse interval
+ | | delay | (delay time)
+ | +---+---+
+ | |+-------------------+
+ | v
+ | +------+
+ | | idle | idle time
+ | +---+--+
+ | |+-------------------+
+ +--------+
+ */
+public class Proxy implements Delayed, Serializable {
+
+ private static final long serialVersionUID = 228939737383625551L;
+ public static final int ERROR_403 = 403;
+ public static final int ERROR_404 = 404;
+ public static final int ERROR_BANNED = 10000;
+ public static final int ERROR_Proxy = 10001;
+ public static final int SUCCESS = 200;
+
+ private final HttpHost httpHost;
+
+ private int reuseTimeInterval = 1500;// ms
+ private Long canReuseTime = 0L;
+ private Long lastBorrowTime = System.currentTimeMillis();
+ private Long responseTime = 0L;
+ private Long idleTime = 0L;
+
+ private int failedNum = 0;
+ private int successNum = 0;
+ private int borrowNum = 0;
+
+ private List failedErrorType = new ArrayList();
+
+ Proxy(HttpHost httpHost) {
+ this.httpHost = httpHost;
+ this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
+ }
+
+ Proxy(HttpHost httpHost, int reuseInterval) {
+ this.httpHost = httpHost;
+ this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
+ }
+
+ public int getSuccessNum() {
+ return successNum;
+ }
+
+ public void successNumIncrement(int increment) {
+ this.successNum += increment;
+ }
+
+ public Long getLastUseTime() {
+ return lastBorrowTime;
+ }
+
+ public void setLastBorrowTime(Long lastBorrowTime) {
+ this.lastBorrowTime = lastBorrowTime;
+ }
+
+ public void recordResponse() {
+ this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
+ this.lastBorrowTime = System.currentTimeMillis();
+ }
+
+ public List getFailedErrorType() {
+ return failedErrorType;
+ }
+
+ public void setFailedErrorType(List failedErrorType) {
+ this.failedErrorType = failedErrorType;
+ }
+
+ public void fail(int failedErrorType) {
+ this.failedNum++;
+ this.failedErrorType.add(failedErrorType);
+ }
+
+ public void setFailedNum(int failedNum) {
+ this.failedNum = failedNum;
+ }
+
+ public int getFailedNum() {
+ return failedNum;
+ }
+
+ public String getFailedType() {
+ String re = "";
+ for (Integer i : this.failedErrorType) {
+ re += i + " . ";
+ }
+ return re;
+ }
+
+ public HttpHost getHttpHost() {
+ return httpHost;
+ }
+
+ public int getReuseTimeInterval() {
+ return reuseTimeInterval;
+ }
+
+ public void setReuseTimeInterval(int reuseTimeInterval) {
+ this.reuseTimeInterval = reuseTimeInterval;
+ this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
+
+ }
+
+ @Override
+ public long getDelay(TimeUnit unit) {
+ return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS);
+ }
+
+ @Override
+ public int compareTo(Delayed o) {
+ Proxy that = (Proxy) o;
+ return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
+
+ }
+
+ @Override
+ public String toString() {
+
+ String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
+ successNum * 100.0 / borrowNum, borrowNum);
+ return re;
+
+ }
+
+ public void borrowNumIncrement(int increment) {
+ this.borrowNum += increment;
+ }
+
+ public int getBorrowNum() {
+ return borrowNum;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
new file mode 100644
index 0000000..73c5ed6
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
@@ -0,0 +1,290 @@
+package us.codecraft.webmagic.proxy;
+
+import org.apache.http.HttpHost;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.DelayQueue;
+
+/**
+ * ClassName:ProxyPool
+ *
+ * @see
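+ * @Function: Pool of HTTP proxies; a proxy that has been used is held back in a delay queue for a reuse interval before it can be borrowed again.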
+ * @author ch
+ * @version Ver 1.0
+ * @Date 2014-2-14 下午01:10:04
+ */
+public class ProxyPool {
+
+ // Per-instance logger named after the runtime class.
+ private Logger logger = LoggerFactory.getLogger(getClass());
+
+ // Proxies available for borrowing; a DelayQueue only releases an element once its delay has expired.
+ private BlockingQueue proxyQueue = new DelayQueue();
+ // Every proxy known to the pool (including ones currently borrowed), keyed by a host string.
+ private Map allProxy = new ConcurrentHashMap();
+
+ // Reuse interval applied to a proxy when it is created and after each successful return.
+ private int reuseInterval = 1500;// ms
+ // Interval assigned to a proxy that has failed too often (2 hours).
+ private int reviveTime = 2 * 60 * 60 * 1000;// ms
+
+ // Set to true by both addProxy overloads and by enable(boolean).
+ private boolean isEnable = false;
+ // When true, a proxy is only added if ProxyUtil.validateProxy accepts it.
+ private boolean validateWhenInit = false;
+ // Relative path of the serialized proxy map written by saveProxyList and read by readProxyList.
+ private String proxyFile = "data/lastUse.proxy";
+
+ // Daemon timer (isDaemon = true) used to run saveProxyTask.
+ private Timer timer = new Timer(true);
+ // Persists the proxy map and logs the status of all proxies each time it fires.
+ private TimerTask saveProxyTask = new TimerTask() {
+
+ @Override
+ public void run() {
+ saveProxyList();
+ logger.info(allProxyStatus());
+ }
+ };
+
+ /**
+ * Creates an empty pool: nothing is loaded, no save task is scheduled and isEnable stays false.
+ */
+ public ProxyPool() {
+
+ }
+
+ /**
+ * Creates a pool from the proxies persisted in proxyFile plus the given list, then
+ * schedules saveProxyTask to run every 10 minutes, starting 10 minutes from now.
+ *
+ * @param httpProxyList proxy entries; each element is converted to a String[] and passed to addProxy(String[]...)
+ */
+ public ProxyPool(List httpProxyList) {
+ readProxyList();
+ addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
+ timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000);
+ }
+
+    /**
+     * Serializes the current proxy map to {@code proxyFile}. Does nothing when the pool
+     * is empty. Failures are logged and otherwise ignored (persistence is best-effort).
+     */
+    private void saveProxyList() {
+        if (allProxy.size() == 0) {
+            return;
+        }
+        // The default path lives in a sub-directory that may not exist on a fresh checkout.
+        File parent = new File(proxyFile).getAbsoluteFile().getParentFile();
+        if (parent != null && !parent.exists() && !parent.mkdirs()) {
+            logger.warn("cannot create directory for proxy file " + proxyFile);
+        }
+        OutputStream fileOut = null;
+        try {
+            fileOut = new FileOutputStream(proxyFile);
+            ObjectOutputStream os = new ObjectOutputStream(fileOut);
+            os.writeObject(prepareForSaving());
+            os.flush();
+            logger.info("save proxy");
+        } catch (FileNotFoundException e) {
+            logger.error("proxy file not found", e);
+        } catch (IOException e) {
+            logger.error("save proxy file " + proxyFile + " error", e);
+        } finally {
+            // Always release the file handle, also when serialization fails half-way.
+            if (fileOut != null) {
+                try {
+                    fileOut.close();
+                } catch (IOException e) {
+                    logger.warn("close proxy file " + proxyFile + " error", e);
+                }
+            }
+        }
+    }
+
+ /**
+ * Builds the map that saveProxyList serializes: a HashMap holding the same keys and the
+ * same Proxy instances as allProxy.
+ * NOTE(review): setFailedNum(0) is applied to the live Proxy objects, not to copies, so
+ * every save also resets the failure counters of the running pool - confirm this is intended.
+ *
+ * @return a new HashMap with the entries of allProxy
+ */
+ private Map prepareForSaving() {
+ Map tmp = new HashMap();
+ for (Entry e : allProxy.entrySet()) {
+ Proxy p = e.getValue();
+ p.setFailedNum(0);
+ tmp.put(e.getKey(), p);
+ }
+ return tmp;
+ }
+
+    /**
+     * Loads the proxy map persisted by a previous run from {@code proxyFile} and adds its
+     * entries to the pool. A missing or unreadable file is logged and otherwise ignored.
+     */
+    @SuppressWarnings("unchecked")
+    private void readProxyList() {
+        InputStream fileIn = null;
+        try {
+            fileIn = new FileInputStream(proxyFile);
+            // NOTE(review): Java-native deserialization trusts the file content completely;
+            // proxyFile must never point at data an untrusted party can write.
+            ObjectInputStream is = new ObjectInputStream(fileIn);
+            Object saved = is.readObject();
+            if (saved instanceof Map) {
+                addProxy((Map<String, Proxy>) saved);
+            } else {
+                logger.error("proxy file " + proxyFile + " does not contain a proxy map");
+            }
+        } catch (FileNotFoundException e) {
+            logger.error("proxy file not found", e);
+        } catch (IOException e) {
+            logger.error("read proxy file " + proxyFile + " error", e);
+        } catch (ClassNotFoundException e) {
+            logger.error("read proxy file " + proxyFile + " error", e);
+        } finally {
+            // Always release the file handle, also when deserialization fails.
+            if (fileIn != null) {
+                try {
+                    fileIn.close();
+                } catch (IOException e) {
+                    logger.warn("close proxy file " + proxyFile + " error", e);
+                }
+            }
+        }
+    }
+
+ /**
+ * Adds previously persisted proxies to the pool and marks the pool as enabled.
+ * Entries whose key is already present are skipped. Each added proxy gets its failure
+ * counter reset and its reuse interval set to the pool's reuseInterval before it is
+ * queued and registered in allProxy.
+ *
+ * @param httpProxyMap map of key to Proxy, as produced by prepareForSaving
+ */
+ private void addProxy(Map httpProxyMap) {
+ isEnable = true;
+ for (Entry entry : httpProxyMap.entrySet()) {
+ try {
+ if (allProxy.containsKey(entry.getKey())) {
+ continue;
+ }
+ // validation only runs when validateWhenInit is set
+ if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) {
+ entry.getValue().setFailedNum(0);
+ entry.getValue().setReuseTimeInterval(reuseInterval);
+ proxyQueue.add(entry.getValue());
+ allProxy.put(entry.getKey(), entry.getValue());
+ }
+ } catch (NumberFormatException e) {
+ // NOTE(review): nothing visible in the try block parses a number; presumably copied from the String[] overload
+ logger.error("HttpHost init error:", e);
+ }
+ }
+ logger.info("proxy pool size>>>>" + allProxy.size());
+ }
+
+ public void addProxy(String[]... httpProxyList) {
+ isEnable = true;
+ for (String[] s : httpProxyList) {
+ try {
+ if (allProxy.containsKey(s[0])) {
+ continue;
+ }
+ HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
+ if (!validateWhenInit || ProxyUtil.validateProxy(item)) {
+ Proxy p = new Proxy(item, reuseInterval);
+ proxyQueue.add(p);
+ allProxy.put(s[0], p);
+ }
+ } catch (NumberFormatException e) {
+ logger.error("HttpHost init error:", e);
+ } catch (UnknownHostException e) {
+ logger.error("HttpHost init error:", e);
+ }
+ }
+ logger.info("proxy pool size>>>>" + allProxy.size());
+ }
+
+    /**
+     * Borrows the next available proxy, blocking until one has finished its reuse delay.
+     * The borrow time and borrow counter of the proxy are updated.
+     *
+     * @return the host of the borrowed proxy
+     * @throws NoSuchElementException if the calling thread is interrupted while waiting
+     */
+    public HttpHost getProxy() {
+        Proxy proxy;
+        try {
+            long start = System.currentTimeMillis();
+            proxy = proxyQueue.take();
+            // Compare milliseconds with milliseconds; the value is only converted for the log line.
+            long costMillis = System.currentTimeMillis() - start;
+            if (costMillis > reuseInterval) {
+                logger.info("get proxy time >>>> " + costMillis / 1000.0);
+            }
+        } catch (InterruptedException e) {
+            // Restore the flag so callers further up can still observe the interruption.
+            Thread.currentThread().interrupt();
+            logger.error("get proxy error", e);
+            throw new NoSuchElementException();
+        }
+        // The queue element is the pool's own Proxy instance, so no map lookup is needed.
+        proxy.setLastBorrowTime(System.currentTimeMillis());
+        proxy.borrowNumIncrement(1);
+        return proxy.getHttpHost();
+    }
+
+    /**
+     * Returns a borrowed proxy to the pool and updates its statistics and reuse delay
+     * according to the outcome of the request it served.
+     * <ul>
+     * <li>{@code Proxy.SUCCESS}: failure state is cleared, the normal reuse interval applies;</li>
+     * <li>{@code Proxy.ERROR_403}: counted as failure, the interval grows with the failure count;</li>
+     * <li>{@code Proxy.ERROR_BANNED}: counted as failure, 10 minutes per failure;</li>
+     * <li>{@code Proxy.ERROR_404}: ignored;</li>
+     * <li>anything else: counted as failure.</li>
+     * </ul>
+     * A proxy with more than 20 failures, or one that fails validation at every 5th
+     * failure, is not put back into the queue.
+     * NOTE(review): such a proxy gets {@code reviveTime} as interval but is never re-queued
+     * here, so it cannot be borrowed again - confirm whether revival is meant to happen.
+     *
+     * @param host       the proxy being returned; null or unknown hosts are ignored
+     * @param statusCode outcome of the request, see the constants in {@code Proxy}
+     */
+    public void returnProxy(HttpHost host, int statusCode) {
+        // A request can fail before a proxy was attached to it.
+        if (host == null || host.getAddress() == null) {
+            return;
+        }
+        Proxy p = allProxy.get(host.getAddress().getHostAddress());
+        if (p == null) {
+            return;
+        }
+        switch (statusCode) {
+        case Proxy.SUCCESS:
+            p.setReuseTimeInterval(reuseInterval);
+            p.setFailedNum(0);
+            p.setFailedErrorType(new ArrayList<Integer>());
+            p.recordResponse();
+            p.successNumIncrement(1);
+            break;
+        case Proxy.ERROR_403:
+            // banned,try larger interval
+            p.fail(Proxy.ERROR_403);
+            p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
+            logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+            break;
+        case Proxy.ERROR_BANNED:
+            p.fail(Proxy.ERROR_BANNED);
+            p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
+            logger.warn("this proxy is banned >>>> " + p.getHttpHost());
+            logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
+            break;
+        case Proxy.ERROR_404:
+            // a missing page says nothing about the proxy itself
+            break;
+        default:
+            p.fail(statusCode);
+            break;
+        }
+        if (p.getFailedNum() > 20) {
+            p.setReuseTimeInterval(reviveTime);
+            logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+            return;
+        }
+        // Re-validate on every 5th failure only. Without the "> 0" guard the check also ran
+        // for a failure count of 0, i.e. after every successful request.
+        if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
+            if (!ProxyUtil.validateProxy(host)) {
+                p.setReuseTimeInterval(reviveTime);
+                logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
+                return;
+            }
+        }
+        try {
+            proxyQueue.put(p);
+        } catch (InterruptedException e) {
+            logger.warn("proxyQueue return proxy error", e);
+        }
+    }
+
+    /**
+     * Renders one line per known proxy, preceded by a header line.
+     *
+     * @return the status report of all proxies in the pool
+     */
+    public String allProxyStatus() {
+        StringBuilder status = new StringBuilder("all proxy info >>>> \n");
+        for (Proxy proxy : allProxy.values()) {
+            status.append(proxy.toString()).append("\n");
+        }
+        return status.toString();
+    }
+
+ /**
+ * @return the number of proxies currently held in proxyQueue (this counts proxies still waiting out their delay as well)
+ */
+ public int getIdleNum() {
+ return proxyQueue.size();
+ }
+
+ /**
+ * @return the pool-wide reuse interval in ms
+ */
+ public int getReuseInterval() {
+ return reuseInterval;
+ }
+
+ /**
+ * Sets the pool-wide reuse interval. Only the field is updated here; an existing proxy
+ * picks the value up when returnProxy or addProxy next applies it.
+ *
+ * @param reuseInterval interval in ms
+ */
+ public void setReuseInterval(int reuseInterval) {
+ this.reuseInterval = reuseInterval;
+ }
+
+ public static List getProxyList() {
+ List proxyList = new ArrayList();
+ BufferedReader br = null;
+ try {
+ br = new BufferedReader(new FileReader(new File("proxy.txt")));
+
+ String line = "";
+ while ((line = br.readLine()) != null) {
+ proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
+ }
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return proxyList;
+ }
+
+ /**
+ * Interactive manual test driver: builds a pool from getProxyList(), then loops forever.
+ * Each round waits for input on stdin, borrows proxies while more than two are queued,
+ * prints the pool status, waits again, returns every borrowed proxy with status 200 and
+ * prints the status once more.
+ *
+ * @param args unused
+ * @throws IOException if reading from stdin fails
+ */
+ public static void main(String[] args) throws IOException {
+ ProxyPool proxyPool = new ProxyPool(getProxyList());
+ proxyPool.setReuseInterval(10000);
+ // proxyPool.saveProxyList();
+
+ while (true) {
+ List httphostList = new ArrayList();
+ // block until a byte arrives on stdin
+ System.in.read();
+ int i = 0;
+ while (proxyPool.getIdleNum() > 2) {
+ HttpHost httphost = proxyPool.getProxy();
+ httphostList.add(httphost);
+ // proxyPool.proxyPool.use(httphost);
+ proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
+ i++;
+ }
+ System.out.println(proxyPool.allProxyStatus());
+ System.in.read();
+ for (i = 0; i < httphostList.size(); i++) {
+ proxyPool.returnProxy(httphostList.get(i), 200);
+ proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
+ }
+ System.out.println(proxyPool.allProxyStatus());
+ System.in.read();
+ }
+
+ }
+
+ /**
+ * Switches the enabled flag of the pool on or off.
+ *
+ * @param isEnable new value of the flag
+ */
+ public void enable(boolean isEnable) {
+ this.isEnable = isEnable;
+ }
+
+ /**
+ * @return the enabled flag; it is also set to true by both addProxy overloads
+ */
+ public boolean isEnable() {
+ return isEnable;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java
new file mode 100644
index 0000000..f045e0d
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java
@@ -0,0 +1,101 @@
+package us.codecraft.webmagic.proxy;
+
+import java.io.IOException;
+import java.net.Inet6Address;
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import java.net.NetworkInterface;
+import java.net.Socket;
+import java.net.SocketException;
+import java.util.Enumeration;
+
+import org.apache.http.HttpHost;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * ClassName:ProxyUtil
+ *
+ * @see
+ * @author ch
+ * @version Ver 1.0
+ * @Date 2014-2-16 下午04:20:07
+ */
+public class ProxyUtil {
+    // TODO: convert to a singleton
+    // Interface that is tried first when looking for the local address.
+    private static final String PREFERRED_INTERFACE = "eth7";
+    // Local address the validation socket is bound to; stays null when none could be found.
+    private static InetAddress localAddr;
+    private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class);
+    static {
+        init();
+    }
+
+    /**
+     * Determines the local address used for proxy validation: first from the preferred
+     * interface, otherwise from the first interface that offers a usable address. When no
+     * address is found the available interfaces are logged and localAddr stays null.
+     */
+    private static void init() {
+        try {
+            NetworkInterface preferred = NetworkInterface.getByName(PREFERRED_INTERFACE);
+            if (preferred != null) {
+                localAddr = pickAddress(preferred);
+            }
+            if (localAddr == null) {
+                // The preferred interface does not exist on this machine; look at all of them.
+                Enumeration<NetworkInterface> all = NetworkInterface.getNetworkInterfaces();
+                while (localAddr == null && all != null && all.hasMoreElements()) {
+                    localAddr = pickAddress(all.nextElement());
+                }
+            }
+            if (localAddr != null) {
+                logger.info("local IP:" + localAddr.getHostAddress());
+            } else {
+                logger.error("choose NetworkInterface\n" + getNetworkInterface());
+            }
+        } catch (Exception e) {
+            logger.error("Failure when init ProxyUtil", e);
+            logger.error("choose NetworkInterface\n" + getNetworkInterface());
+        }
+
+    }
+
+    /** Returns the first non-loopback, non-link-local IPv4 address of the interface, or null. */
+    private static InetAddress pickAddress(NetworkInterface ni) {
+        Enumeration<InetAddress> candidates = ni.getInetAddresses();
+        while (candidates.hasMoreElements()) {
+            InetAddress tmp = candidates.nextElement();
+            if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
+                return tmp;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Checks whether a TCP connection to the proxy can be opened within 3 seconds,
+     * using a socket bound to the local address found at class initialization.
+     *
+     * @param p proxy endpoint to test
+     * @return true if the connection was established; false if it failed or no local address is known
+     */
+    public static boolean validateProxy(HttpHost p) {
+        if (localAddr == null) {
+            logger.error("cannot get local ip");
+            return false;
+        }
+        boolean isReachable = false;
+        Socket socket = null;
+        try {
+            socket = new Socket();
+            socket.bind(new InetSocketAddress(localAddr, 0));
+            InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort());
+            socket.connect(endpointSocketAddr, 3000);
+            logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
+            isReachable = true;
+        } catch (IOException e) {
+            logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p);
+        } finally {
+            if (socket != null) {
+                try {
+                    socket.close();
+                } catch (IOException e) {
+                    logger.warn("Error occurred while closing socket of validating proxy", e);
+                }
+            }
+        }
+        return isReachable;
+    }
+
+    /**
+     * Lists all network interfaces with their addresses, for diagnostic log output.
+     *
+     * @return one line per interface followed by one indented line per address; empty if none can be listed
+     */
+    private static String getNetworkInterface() {
+        StringBuilder networkInterfaceName = new StringBuilder();
+        Enumeration<NetworkInterface> enumeration = null;
+        try {
+            enumeration = NetworkInterface.getNetworkInterfaces();
+        } catch (SocketException e1) {
+            logger.warn("cannot list network interfaces", e1);
+        }
+        // Stays null after an exception, and the API may also return null when no interface exists.
+        if (enumeration == null) {
+            return networkInterfaceName.toString();
+        }
+        while (enumeration.hasMoreElements()) {
+            NetworkInterface networkInterface = enumeration.nextElement();
+            networkInterfaceName.append(networkInterface.toString()).append('\n');
+            Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
+            while (addr.hasMoreElements()) {
+                networkInterfaceName.append("\tip:").append(addr.nextElement().getHostAddress()).append("\n");
+            }
+        }
+        return networkInterfaceName.toString();
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
new file mode 100644
index 0000000..e2bb552
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
@@ -0,0 +1,109 @@
+package us.codecraft.webmagic.selector;
+
+import org.apache.commons.collections.CollectionUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author code4crafer@gmail.com
+ * @since 0.5.2
+ */
+public abstract class AbstractSelectable implements Selectable {
+
+    /**
+     * @return the texts this selectable operates on; implementations may return null
+     */
+    protected abstract List<String> getSourceTexts();
+
+    @Override
+    public Selectable css(String selector) {
+        return $(selector);
+    }
+
+    @Override
+    public Selectable css(String selector, String attrName) {
+        return $(selector, attrName);
+    }
+
+    /**
+     * Applies the selector to every text and keeps the non-null single results.
+     *
+     * @param selector selector to apply
+     * @param strings  texts to select from
+     * @return a PlainText wrapping the collected results
+     */
+    protected Selectable select(Selector selector, List<String> strings) {
+        List<String> results = new ArrayList<String>();
+        for (String string : strings) {
+            String result = selector.select(string);
+            if (result != null) {
+                results.add(result);
+            }
+        }
+        return new PlainText(results);
+    }
+
+    /**
+     * Applies the selector to every text and concatenates all result lists.
+     *
+     * @param selector selector to apply
+     * @param strings  texts to select from
+     * @return a PlainText wrapping the collected results
+     */
+    protected Selectable selectList(Selector selector, List<String> strings) {
+        List<String> results = new ArrayList<String>();
+        for (String string : strings) {
+            List<String> result = selector.selectList(string);
+            // A selector that reports "nothing found" as null must not abort the whole selection.
+            if (result != null) {
+                results.addAll(result);
+            }
+        }
+        return new PlainText(results);
+    }
+
+    @Override
+    public List<String> all() {
+        return getSourceTexts();
+    }
+
+    @Override
+    public Selectable jsonPath(String jsonPath) {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * @return the first element of all(), or null when there is none
+     */
+    @Override
+    public String get() {
+        // Evaluate all() once; subclasses may compute it on demand.
+        List<String> texts = all();
+        if (CollectionUtils.isNotEmpty(texts)) {
+            return texts.get(0);
+        } else {
+            return null;
+        }
+    }
+
+    @Override
+    public Selectable select(Selector selector) {
+        return select(selector, getSourceTexts());
+    }
+
+    @Override
+    public Selectable selectList(Selector selector) {
+        return selectList(selector, getSourceTexts());
+    }
+
+    @Override
+    public Selectable regex(String regex) {
+        RegexSelector regexSelector = Selectors.regex(regex);
+        return selectList(regexSelector, getSourceTexts());
+    }
+
+    @Override
+    public Selectable regex(String regex, int group) {
+        RegexSelector regexSelector = Selectors.regex(regex, group);
+        return selectList(regexSelector, getSourceTexts());
+    }
+
+    @Override
+    public Selectable replace(String regex, String replacement) {
+        ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
+        return select(replaceSelector, getSourceTexts());
+    }
+
+    /**
+     * @return the first source text, or null when there is none
+     */
+    public String getFirstSourceText() {
+        List<String> sourceTexts = getSourceTexts();
+        if (sourceTexts != null && sourceTexts.size() > 0) {
+            return sourceTexts.get(0);
+        }
+        return null;
+    }
+
+    @Override
+    public String toString() {
+        return get();
+    }
+
+    /**
+     * @return true when at least one source text is present
+     */
+    @Override
+    public boolean match() {
+        List<String> sourceTexts = getSourceTexts();
+        return sourceTexts != null && sourceTexts.size() > 0;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
index 7d9035f..bbc7217 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
@@ -1,6 +1,7 @@
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
@@ -28,4 +29,25 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
}
}
+ public Element selectElement(String text) { // string overload: parses the text, then selects one element
+ if (text != null) {
+ return selectElement(Jsoup.parse(text)); // delegates to the Element overload on the parsed document
+ }
+ return null; // null text yields null
+ }
+
+ public List selectElements(String text) { // string overload: parses the text, then selects all elements
+ if (text != null) {
+ return selectElements(Jsoup.parse(text)); // delegates to the Element overload on the parsed document
+ } else {
+ return new ArrayList(); // null text yields a new empty list, not null
+ }
+ }
+
+ public abstract Element selectElement(Element element); // single-element selection; selectElement(String) delegates here
+
+ public abstract List selectElements(Element element); // multi-element selection; selectElements(String) delegates here
+
+ public abstract boolean hasAttribute(); // when true, HtmlNode.selectElements collects strings (PlainText) instead of elements
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
index 185db74..6a638db 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
@@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public String select(Element element) {
- Elements elements = element.select(selectorText);
+ List elements = selectElements(element);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
@@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public List selectList(Element doc) {
List strings = new ArrayList();
- Elements elements = doc.select(selectorText);
+ List elements = selectElements(doc);
if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) {
String value = getValue(element);
@@ -78,4 +78,23 @@ public class CssSelector extends BaseElementSelector {
}
return strings;
}
+
+ @Override
+ public Element selectElement(Element element) { // first element matching selectorText, or null
+ Elements elements = element.select(selectorText);
+ if (CollectionUtils.isNotEmpty(elements)) {
+ return elements.get(0); // only the first match is returned
+ }
+ return null; // nothing matched
+ }
+
+ @Override
+ public List selectElements(Element element) { // all elements matching selectorText
+ return element.select(selectorText); // result of select(...) is returned as-is, without copying
+ }
+
+ @Override
+ public boolean hasAttribute() { // true when this selector was given an attribute name
+ return attrName != null;
+ }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 34386b5..7b593ed 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
/**
@@ -14,7 +15,7 @@ import java.util.List;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-public class Html extends PlainText {
+public class Html extends HtmlNode {
private Logger logger = LoggerFactory.getLogger(getClass());
@@ -23,123 +24,26 @@ public class Html extends PlainText {
*/
private Document document;
- private boolean needInitCache = true;
-
- public Html(List strings) {
- super(strings);
- }
-
public Html(String text) {
- super(text);
- }
-
- public Html(List strings, boolean needInitCache) {
- super(strings);
- this.needInitCache = needInitCache;
- }
-
- public Html(String text, boolean needInitCache) {
- super(text);
- this.needInitCache = needInitCache;
- }
-
- /**
- * lazy init
- */
- private void initDocument() {
- if (this.document == null && needInitCache) {
- needInitCache = false;
- //just init once whether the parsing succeeds or not
- try {
- this.document = Jsoup.parse(getText());
- } catch (Exception e) {
- logger.warn("parse document error ", e);
- }
+ try {
+ this.document = Jsoup.parse(text);
+ } catch (Exception e) {
+ this.document = null;
+ logger.warn("parse document error ", e);
}
}
public Html(Document document) {
- super(document.html());
this.document = document;
}
- public static Html create(String text) {
- return new Html(text);
- }
-
- @Override
- protected Selectable select(Selector selector, List strings) {
- initDocument();
- List results = new ArrayList();
- for (String string : strings) {
- String result = selector.select(string);
- if (result != null) {
- results.add(result);
- }
- }
- return new Html(results, false);
- }
-
- @Override
- protected Selectable selectList(Selector selector, List strings) {
- initDocument();
- List results = new ArrayList();
- for (String string : strings) {
- List result = selector.selectList(string);
- results.addAll(result);
- }
- return new Html(results, false);
- }
-
- @Override
- public Selectable smartContent() {
- initDocument();
- SmartContentSelector smartContentSelector = Selectors.smartContent();
- return select(smartContentSelector, strings);
- }
-
- @Override
- public Selectable links() {
- return xpath("//a/@href");
- }
-
- @Override
- public Selectable xpath(String xpath) {
- XpathSelector xpathSelector = Selectors.xpath(xpath);
- if (document != null) {
- return new Html(xpathSelector.selectList(document), false);
- }
- return selectList(xpathSelector, strings);
- }
-
- @Override
- public Selectable $(String selector) {
- CssSelector cssSelector = Selectors.$(selector);
- if (document != null) {
- return new Html(cssSelector.selectList(document), false);
- }
- return selectList(cssSelector, strings);
- }
-
- @Override
- public Selectable $(String selector, String attrName) {
- CssSelector cssSelector = Selectors.$(selector, attrName);
- if (document != null) {
- return new Html(cssSelector.selectList(document), false);
- }
- return selectList(cssSelector, strings);
- }
-
public Document getDocument() {
- initDocument();
return document;
}
- public String getText() {
- if (strings != null && strings.size() > 0) {
- return strings.get(0);
- }
- return document.html();
+ @Override
+ protected List getElements() {
+ return Collections.singletonList(getDocument());
}
/**
@@ -151,7 +55,7 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.select(getDocument());
} else {
- return selector.select(getText());
+ return selector.select(getFirstSourceText());
}
}
@@ -160,7 +64,12 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.selectList(getDocument());
} else {
- return selector.selectList(getText());
+ return selector.selectList(getFirstSourceText());
}
}
+
+ public static Html create(String text) {
+ return new Html(text);
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
new file mode 100644
index 0000000..e41267b
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -0,0 +1,125 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ListIterator;
+
+/**
+ * @author code4crafter@gmail.com
+ */
+public class HtmlNode extends AbstractSelectable {
+
+ private final List elements; // null when the no-arg constructor is used
+
+ public HtmlNode(List elements) {
+ this.elements = elements; // the list is stored as given, not copied
+ }
+
+ public HtmlNode() { // leaves elements null; a subclass using this constructor must override getElements()
+ elements = null;
+ }
+
+ protected List getElements() { // every method below reads the elements through this accessor
+ return elements;
+ }
+
+ @Override
+ public Selectable smartContent() {
+ SmartContentSelector smartContentSelector = Selectors.smartContent();
+ return select(smartContentSelector, getSourceTexts()); // operates on the elements' string form
+ }
+
+ @Override
+ public Selectable links() { // shorthand for the href attribute of every anchor
+ return xpath("//a/@href");
+ }
+
+ @Override
+ public Selectable xpath(String xpath) {
+ XpathSelector xpathSelector = Selectors.xpath(xpath);
+ return selectElements(xpathSelector); // result type depends on xpathSelector.hasAttribute()
+ }
+
+ /**
+ * Runs the selector on every element and gathers the results into a single Selectable.
+ *
+ * @param elementSelector selector applied to each element after checkElementAndConvert
+ * @return an HtmlNode of matched elements, or a PlainText of strings when hasAttribute() is true
+ */
+ protected Selectable selectElements(BaseElementSelector elementSelector) {
+ ListIterator elementIterator = getElements().listIterator(); // ListIterator so the helper can call set()
+ if (!elementSelector.hasAttribute()) {
+ List resultElements = new ArrayList();
+ while (elementIterator.hasNext()) {
+ Element element = checkElementAndConvert(elementIterator);
+ List selectElements = elementSelector.selectElements(element);
+ resultElements.addAll(selectElements);
+ }
+ return new HtmlNode(resultElements);
+ } else {
+ // hasAttribute() is true: collect the selectList strings and return them as PlainText
+ List resultStrings = new ArrayList();
+ while (elementIterator.hasNext()) {
+ Element element = checkElementAndConvert(elementIterator);
+ List selectList = elementSelector.selectList(element);
+ resultStrings.addAll(selectList);
+ }
+ return new PlainText(resultStrings);
+
+ }
+ }
+
+ /**
+ * Only a Document can be selected on, so a non-Document element is cloned into a new Document.
+ * See: https://github.com/code4craft/webmagic/issues/113
+ *
+ * @param elementIterator iterator whose next element is checked; a wrapping Document replaces it via set()
+ * @return the next element itself if it is a Document, otherwise the new Document holding its clone
+ */
+ private Element checkElementAndConvert(ListIterator elementIterator) {
+ Element element = elementIterator.next();
+ if (!(element instanceof Document)) {
+ Document root = new Document(element.ownerDocument().baseUri()); // NOTE(review): ownerDocument() is dereferenced without a null check
+ Element clone = element.clone(); // the clone, not the original element, is appended to the new root
+ root.appendChild(clone);
+ elementIterator.set(root); // replaces the entry in the list being iterated
+ return root;
+ }
+ return element;
+ }
+
+ @Override
+ public Selectable $(String selector) {
+ CssSelector cssSelector = Selectors.$(selector);
+ return selectElements(cssSelector);
+ }
+
+ @Override
+ public Selectable $(String selector, String attrName) {
+ CssSelector cssSelector = Selectors.$(selector, attrName);
+ return selectElements(cssSelector);
+ }
+
+ @Override
+ public List nodes() { // one single-element HtmlNode per element, in order
+ List selectables = new ArrayList();
+ for (Element element : getElements()) {
+ List childElements = new ArrayList(1);
+ childElements.add(element);
+ selectables.add(new HtmlNode(childElements));
+ }
+ return selectables;
+ }
+
+ @Override
+ protected List getSourceTexts() { // rebuilt from the elements on every call; nothing is cached
+ List sourceTexts = new ArrayList(getElements().size());
+ for (Element element : getElements()) {
+ sourceTexts.add(element.toString()); // string form of each element, in order
+ }
+ return sourceTexts;
+ }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
index ef45d00..4c31eb4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
@@ -1,7 +1,7 @@
package us.codecraft.webmagic.selector;
import com.alibaba.fastjson.JSON;
-import org.jsoup.parser.TokenQueue;
+import us.codecraft.xsoup.XTokenQueue;
import java.util.List;
@@ -26,39 +26,32 @@ public class Json extends PlainText {
* @return
*/
public Json removePadding(String padding) {
- String text = getText();
- TokenQueue tokenQueue = new TokenQueue(text);
+ String text = getFirstSourceText();
+ XTokenQueue tokenQueue = new XTokenQueue(text);
tokenQueue.consumeWhitespace();
tokenQueue.consume(padding);
tokenQueue.consumeWhitespace();
- String chompBalanced = tokenQueue.chompBalanced('(', ')');
+ String chompBalanced = tokenQueue.chompBalancedNotInQuotes('(', ')');
return new Json(chompBalanced);
}
public T toObject(Class clazz) {
- if (getText() == null) {
+ if (getFirstSourceText() == null) {
return null;
}
- return JSON.parseObject(getText(), clazz);
+ return JSON.parseObject(getFirstSourceText(), clazz);
}
public List toList(Class clazz) {
- if (getText() == null) {
+ if (getFirstSourceText() == null) {
return null;
}
- return JSON.parseArray(getText(), clazz);
- }
-
- public String getText() {
- if (strings != null && strings.size() > 0) {
- return strings.get(0);
- }
- return null;
+ return JSON.parseArray(getFirstSourceText(), clazz);
}
@Override
public Selectable jsonPath(String jsonPath) {
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
- return selectList(jsonPathSelector,strings);
+ return selectList(jsonPathSelector,getSourceTexts());
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
index f9083a8..b0b90f9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
@@ -46,9 +46,12 @@ public class JsonPathSelector implements Selector {
return list;
}
if (object instanceof List) {
- return (List) object;
+ List