diff --git a/.gitignore b/.gitignore index d7d63fe..0175dba 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ out/ .idea .classpath .project +.settings/ +bin/ +.myeclipse diff --git a/README.md b/README.md index b23bf83..deb17d4 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.5.1 + 0.5.2 us.codecraft webmagic-extension - 0.5.1 + 0.5.2 ``` diff --git a/pom.xml b/pom.xml index c17e1a3..d5a107e 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.5.1 + 0.5.2 4.0.0 pom @@ -88,13 +88,25 @@ us.codecraft xsoup - 0.2.2 + 0.2.4 com.alibaba fastjson 1.1.37 + + com.github.dreamhead + moco-core + 0.9.1 + test + + + org.slf4j + slf4j-simple + + + log4j log4j @@ -230,22 +242,44 @@ - release-sign-artifacts - - - performRelease - true - - + release + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + package + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9.1 + + + package + + jar + + + + + org.apache.maven.plugins maven-gpg-plugin - 1.1 + 1.5 - sign-artifacts verify sign @@ -253,10 +287,29 @@ + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6 + true + + sonatype-nexus-staging + https://oss.sonatype.org/ + true + + + + + sonatype-nexus-snapshots + https://oss.sonatype.org/content/repositories/snapshots/ + + + sonatype-nexus-staging + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + - - diff --git a/webmagic-avalon/pom.xml b/webmagic-avalon/pom.xml index 0dbb369..3653649 100644 --- a/webmagic-avalon/pom.xml +++ b/webmagic-avalon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1-SNAPSHOT + 0.5.2-SNAPSHOT-SNAPSHOT 4.0.0 diff --git a/webmagic-avalon/webmagic-admin/pom.xml b/webmagic-avalon/webmagic-admin/pom.xml index ed364c1..ec73349 100644 --- a/webmagic-avalon/webmagic-admin/pom.xml +++ b/webmagic-avalon/webmagic-admin/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.2-SNAPSHOT-SNAPSHOT 4.0.0 diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 9c7199a..cec0b83 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.2-SNAPSHOT-SNAPSHOT 4.0.0 diff --git a/webmagic-avalon/webmagic-worker/pom.xml b/webmagic-avalon/webmagic-worker/pom.xml index ebc5174..1d5df01 100644 --- a/webmagic-avalon/webmagic-worker/pom.xml +++ b/webmagic-avalon/webmagic-worker/pom.xml @@ -3,7 +3,7 @@ webmagic-avalon us.codecraft - 0.5.1-SNAPSHOT + 0.5.2-SNAPSHOT-SNAPSHOT 4.0.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6973109..85e9c4a 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.1 + 0.5.2 4.0.0 @@ -35,6 +35,11 @@ xsoup + + com.github.dreamhead + moco-core + + org.slf4j slf4j-api diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 1f8a194..9a0321e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -18,6 +18,8 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; + public static final String STATUS_CODE = "statusCode"; + public static final String PROXY = "proxy"; private String url; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index a7c7bf8..01a4c75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -3,6 +3,8 @@ package us.codecraft.webmagic; import com.google.common.collect.HashBasedTable; import com.google.common.collect.Table; import org.apache.http.HttpHost; + +import us.codecraft.webmagic.proxy.ProxyPool; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -47,6 +49,8 @@ public class Site { private HttpHost httpProxy; + private ProxyPool httpProxyPool=new ProxyPool(); + private boolean useGzip = true; /** @@ -438,4 +442,32 @@ public class Site { ", headers=" + headers + '}'; } + + /** + * Set httpProxyPool, String[0]:ip, String[1]:port
+ * + * @return this + */ + public Site setHttpProxyPool(List httpProxyList) { + this.httpProxyPool=new ProxyPool(httpProxyList); + return this; + } + + public ProxyPool getHttpProxyPool() { + return httpProxyPool; + } + + public HttpHost getHttpProxyFromPool() { + return httpProxyPool.getProxy(); + } + + public void returnHttpProxyToPool(HttpHost proxy,int statusCode) { + httpProxyPool.returnProxy(proxy,statusCode); + } + + public Site setProxyReuseInterval(int reuseInterval) { + this.httpProxyPool.setReuseInterval(reuseInterval); + return this; + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 81cf179..6f6453b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic; import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; +import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; @@ -324,6 +325,10 @@ public class Spider implements Runnable, Task { onError(requestFinal); logger.error("process request " + requestFinal + " error", e); } finally { + if (site.getHttpProxyPool().isEnable()) { + site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal + .getExtra(Request.STATUS_CODE)); + } pageCount.incrementAndGet(); signalNewUrl(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eeae70e..bdafea7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; @@ -12,17 +14,22 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; +import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -79,16 +86,13 @@ public class HttpClientDownloader extends AbstractDownloader { } logger.info("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; + int statusCode=0; try { HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers); httpResponse = getHttpClient(site).execute(httpUriRequest); - int statusCode = httpResponse.getStatusLine().getStatusCode(); + statusCode = httpResponse.getStatusLine().getStatusCode(); + request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { - //charset - if (charset == null) { - String value = httpResponse.getEntity().getContentType().getValue(); - charset = UrlUtils.getCharset(value); - } Page page = handleResponse(request, charset, httpResponse, task); onSuccess(request); return page; @@ -104,6 +108,7 @@ public class HttpClientDownloader extends AbstractDownloader { onError(request); return null; } finally { + request.putExtra(Request.STATUS_CODE, statusCode); try { if (httpResponse != null) { //ensure the connection is released back to pool @@ -136,9 +141,11 @@ public class HttpClientDownloader extends AbstractDownloader { .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); - if (site != null && site.getHttpProxy() != null) { - requestConfigBuilder.setProxy(site.getHttpProxy()); - } + if (site.getHttpProxyPool().isEnable()) { + HttpHost host = site.getHttpProxyFromPool(); + requestConfigBuilder.setProxy(host); + request.putExtra(Request.PROXY, host); + } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); } @@ -168,7 +175,7 @@ public class HttpClientDownloader extends AbstractDownloader { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset); + String content = getContent(charset, httpResponse); Page page = new Page(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); @@ -176,4 +183,57 @@ public class HttpClientDownloader extends AbstractDownloader { page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); return page; } + + protected String getContent(String charset, HttpResponse httpResponse) throws IOException { + if (charset == null) { + byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + String htmlCharset = getHtmlCharset(httpResponse, contentBytes); + if (htmlCharset != null) { + return new String(contentBytes, htmlCharset); + } else { + logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); + return new String(contentBytes); + } + } else { + return IOUtils.toString(httpResponse.getEntity().getContent(), charset); + } + } + + protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { + String charset; + // charset + // 1、encoding in http header Content-Type + String value = httpResponse.getEntity().getContentType().getValue(); + charset = UrlUtils.getCharset(value); + if (StringUtils.isNotBlank(charset)) { + logger.debug("Auto get charset: {}", charset); + return charset; + } + // use default charset to decode first time + Charset defaultCharset = Charset.defaultCharset(); + String content = new String(contentBytes, defaultCharset.name()); + // 2、charset in meta + if (StringUtils.isNotEmpty(content)) { + Document document = Jsoup.parse(content); + Elements links = document.select("meta"); + for (Element link : links) { + // 2.1、html4.01 + String metaContent = link.attr("content"); + String metaCharset = link.attr("charset"); + if (metaContent.indexOf("charset") != -1) { + metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); + charset = metaContent.split("=")[1]; + break; + } + // 2.2、html5 + else if (StringUtils.isNotEmpty(metaCharset)) { + charset = metaCharset; + break; + } + } + } + logger.debug("Auto get charset: {}", charset); + // 3、todo use tools as cpdetector for content decode + return charset; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 8eab426..57d6eea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -4,12 +4,14 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; -import java.io.FileWriter; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Map; @@ -39,7 +41,7 @@ public class FilePipeline extends FilePersistentBase implements Pipeline { public void process(ResultItems resultItems, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { - PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); + PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java new file mode 100644 index 0000000..27e6b52 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -0,0 +1,172 @@ +package us.codecraft.webmagic.proxy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Delayed; +import java.util.concurrent.TimeUnit; + +import org.apache.http.HttpHost; + +/** + * >>>>Proxy Status + +----------+ +-----+ + | last use | | new | + +-----+----+ +---+-+ + | +------+ | + +->| init |<--+ + +--+---+ + | + v + +--------+ + +--->| borrow | + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | in use | Respone Time + | +---+----+ + | |+------------------+ + | v + | +--------+ + | | return | + | +---+----+ + | |+-------------------+ + | v + | +-------+ reuse interval + | | delay | (delay time) + | +---+---+ + | |+-------------------+ + | v + | +------+ + | | idle | idle time + | +---+--+ + | |+-------------------+ + +--------+ + */ +public class Proxy implements Delayed, Serializable { + + private static final long serialVersionUID = 228939737383625551L; + public static final int ERROR_403 = 403; + public static final int ERROR_404 = 404; + public static final int ERROR_BANNED = 10000; + public static final int ERROR_Proxy = 10001; + public static final int SUCCESS = 200; + + private final HttpHost httpHost; + + private int reuseTimeInterval = 1500;// ms + private Long canReuseTime = 0L; + private Long lastBorrowTime = System.currentTimeMillis(); + private Long responseTime = 0L; + private Long idleTime = 0L; + + private int failedNum = 0; + private int successNum = 0; + private int borrowNum = 0; + + private List failedErrorType = new ArrayList(); + + Proxy(HttpHost httpHost) { + this.httpHost = httpHost; + this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); + } + + Proxy(HttpHost httpHost, int reuseInterval) { + this.httpHost = httpHost; + this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS); + } + + public int getSuccessNum() { + return successNum; + } + + public void successNumIncrement(int increment) { + this.successNum += increment; + } + + public Long getLastUseTime() { + return lastBorrowTime; + } + + public void setLastBorrowTime(Long lastBorrowTime) { + this.lastBorrowTime = lastBorrowTime; + } + + public void recordResponse() { + this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2; + this.lastBorrowTime = System.currentTimeMillis(); + } + + public List getFailedErrorType() { + return failedErrorType; + } + + public void setFailedErrorType(List failedErrorType) { + this.failedErrorType = failedErrorType; + } + + public void fail(int failedErrorType) { + this.failedNum++; + this.failedErrorType.add(failedErrorType); + } + + public void setFailedNum(int failedNum) { + this.failedNum = failedNum; + } + + public int getFailedNum() { + return failedNum; + } + + public String getFailedType() { + String re = ""; + for (Integer i : this.failedErrorType) { + re += i + " . "; + } + return re; + } + + public HttpHost getHttpHost() { + return httpHost; + } + + public int getReuseTimeInterval() { + return reuseTimeInterval; + } + + public void setReuseTimeInterval(int reuseTimeInterval) { + this.reuseTimeInterval = reuseTimeInterval; + this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS); + + } + + @Override + public long getDelay(TimeUnit unit) { + return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS); + } + + @Override + public int compareTo(Delayed o) { + Proxy that = (Proxy) o; + return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0); + + } + + @Override + public String toString() { + + String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime, + successNum * 100.0 / borrowNum, borrowNum); + return re; + + } + + public void borrowNumIncrement(int increment) { + this.borrowNum += increment; + } + + public int getBorrowNum() { + return borrowNum; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java new file mode 100644 index 0000000..73c5ed6 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -0,0 +1,290 @@ +package us.codecraft.webmagic.proxy; + +import org.apache.http.HttpHost; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.*; +import java.util.Map.Entry; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.DelayQueue; + +/** + * ClassName:ProxyPool + * + * @see + * @Function: TODO ADD FUNCTION + * @author ch + * @version Ver 1.0 + * @Date 2014-2-14 下午01:10:04 + */ +public class ProxyPool { + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private BlockingQueue proxyQueue = new DelayQueue(); + private Map allProxy = new ConcurrentHashMap(); + + private int reuseInterval = 1500;// ms + private int reviveTime = 2 * 60 * 60 * 1000;// ms + + private boolean isEnable = false; + private boolean validateWhenInit = false; + private String proxyFile = "data/lastUse.proxy"; + + private Timer timer = new Timer(true); + private TimerTask saveProxyTask = new TimerTask() { + + @Override + public void run() { + saveProxyList(); + logger.info(allProxyStatus()); + } + }; + + public ProxyPool() { + + } + + public ProxyPool(List httpProxyList) { + readProxyList(); + addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); + timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000); + } + + private void saveProxyList() { + if (allProxy.size() == 0) { + return; + } + try { + ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile)); + os.writeObject(prepareForSaving()); + os.close(); + logger.info("save proxy"); + } catch (FileNotFoundException e) { + logger.error("proxy file not found", e); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private Map prepareForSaving() { + Map tmp = new HashMap(); + for (Entry e : allProxy.entrySet()) { + Proxy p = e.getValue(); + p.setFailedNum(0); + tmp.put(e.getKey(), p); + } + return tmp; + } + + private void readProxyList() { + try { + ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile)); + addProxy((Map) is.readObject()); + is.close(); + } catch (FileNotFoundException e) { + logger.error("proxy file not found", e); + } catch (IOException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + } + + private void addProxy(Map httpProxyMap) { + isEnable = true; + for (Entry entry : httpProxyMap.entrySet()) { + try { + if (allProxy.containsKey(entry.getKey())) { + continue; + } + if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) { + entry.getValue().setFailedNum(0); + entry.getValue().setReuseTimeInterval(reuseInterval); + proxyQueue.add(entry.getValue()); + allProxy.put(entry.getKey(), entry.getValue()); + } + } catch (NumberFormatException e) { + logger.error("HttpHost init error:", e); + } + } + logger.info("proxy pool size>>>>" + allProxy.size()); + } + + public void addProxy(String[]... httpProxyList) { + isEnable = true; + for (String[] s : httpProxyList) { + try { + if (allProxy.containsKey(s[0])) { + continue; + } + HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); + if (!validateWhenInit || ProxyUtil.validateProxy(item)) { + Proxy p = new Proxy(item, reuseInterval); + proxyQueue.add(p); + allProxy.put(s[0], p); + } + } catch (NumberFormatException e) { + logger.error("HttpHost init error:", e); + } catch (UnknownHostException e) { + logger.error("HttpHost init error:", e); + } + } + logger.info("proxy pool size>>>>" + allProxy.size()); + } + + public HttpHost getProxy() { + Proxy proxy = null; + try { + Long time = System.currentTimeMillis(); + proxy = proxyQueue.take(); + double costTime = (System.currentTimeMillis() - time) / 1000.0; + if (costTime > reuseInterval) { + logger.info("get proxy time >>>> " + costTime); + } + Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); + p.setLastBorrowTime(System.currentTimeMillis()); + p.borrowNumIncrement(1); + } catch (InterruptedException e) { + logger.error("get proxy error", e); + } + if (proxy == null) { + throw new NoSuchElementException(); + } + return proxy.getHttpHost(); + } + + public void returnProxy(HttpHost host, int statusCode) { + Proxy p = allProxy.get(host.getAddress().getHostAddress()); + if (p == null) { + return; + } + switch (statusCode) { + case Proxy.SUCCESS: + p.setReuseTimeInterval(reuseInterval); + p.setFailedNum(0); + p.setFailedErrorType(new ArrayList()); + p.recordResponse(); + p.successNumIncrement(1); + break; + case Proxy.ERROR_403: + // banned,try larger interval + p.fail(Proxy.ERROR_403); + p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); + logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + break; + case Proxy.ERROR_BANNED: + p.fail(Proxy.ERROR_BANNED); + p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); + logger.warn("this proxy is banned >>>> " + p.getHttpHost()); + logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + break; + case Proxy.ERROR_404: + //p.fail(Proxy.ERROR_404); + // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); + break; + default: + p.fail(statusCode); + break; + } + if (p.getFailedNum() > 20) { + // allProxy.remove(host.getAddress().getHostAddress()); + p.setReuseTimeInterval(reviveTime); + logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + return; + } + if (p.getFailedNum()%5==0) { + if (!ProxyUtil.validateProxy(host)) { + // allProxy.remove(host.getAddress().getHostAddress()); + p.setReuseTimeInterval(reviveTime); + logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + return; + } + } + try { + proxyQueue.put(p); + } catch (InterruptedException e) { + logger.warn("proxyQueue return proxy error", e); + } + } + + public String allProxyStatus() { + String re = "all proxy info >>>> \n"; + for (Entry entry : allProxy.entrySet()) { + re += entry.getValue().toString() + "\n"; + } + return re; + + } + + public int getIdleNum() { + return proxyQueue.size(); + } + + public int getReuseInterval() { + return reuseInterval; + } + + public void setReuseInterval(int reuseInterval) { + this.reuseInterval = reuseInterval; + } + + public static List getProxyList() { + List proxyList = new ArrayList(); + BufferedReader br = null; + try { + br = new BufferedReader(new FileReader(new File("proxy.txt"))); + + String line = ""; + while ((line = br.readLine()) != null) { + proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] }); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return proxyList; + } + + public static void main(String[] args) throws IOException { + ProxyPool proxyPool = new ProxyPool(getProxyList()); + proxyPool.setReuseInterval(10000); + // proxyPool.saveProxyList(); + + while (true) { + List httphostList = new ArrayList(); + System.in.read(); + int i = 0; + while (proxyPool.getIdleNum() > 2) { + HttpHost httphost = proxyPool.getProxy(); + httphostList.add(httphost); + // proxyPool.proxyPool.use(httphost); + proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString()); + i++; + } + System.out.println(proxyPool.allProxyStatus()); + System.in.read(); + for (i = 0; i < httphostList.size(); i++) { + proxyPool.returnProxy(httphostList.get(i), 200); + proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString()); + } + System.out.println(proxyPool.allProxyStatus()); + System.in.read(); + } + + } + + public void enable(boolean isEnable) { + this.isEnable = isEnable; + } + + public boolean isEnable() { + return isEnable; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java new file mode 100644 index 0000000..f045e0d --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java @@ -0,0 +1,101 @@ +package us.codecraft.webmagic.proxy; + +import java.io.IOException; +import java.net.Inet6Address; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.NetworkInterface; +import java.net.Socket; +import java.net.SocketException; +import java.util.Enumeration; + +import org.apache.http.HttpHost; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * ClassName:ProxyUtil + * + * @see + * @author ch + * @version Ver 1.0 + * @Date 2014-2-16 下午04:20:07 + */ +public class ProxyUtil { + // TODO 改为单例 + private static InetAddress localAddr; + private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class); + static { + init(); + } + + private static void init() { + Enumeration localAddrs; + try { + NetworkInterface ni = NetworkInterface.getByName("eth7"); + if (ni == null) { + logger.error("choose NetworkInterface\n" + getNetworkInterface()); + } + localAddrs = ni.getInetAddresses(); + while (localAddrs.hasMoreElements()) { + InetAddress tmp = localAddrs.nextElement(); + if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) { + localAddr = tmp; + logger.info("local IP:" + localAddr.getHostAddress()); + break; + } + } + } catch (Exception e) { + logger.error("Failure when init ProxyUtil", e); + logger.error("choose NetworkInterface\n" + getNetworkInterface()); + } + + } + + public static boolean validateProxy(HttpHost p) { + if (localAddr == null) { + logger.error("cannot get local ip"); + return false; + } + boolean isReachable = false; + Socket socket = null; + try { + socket = new Socket(); + socket.bind(new InetSocketAddress(localAddr, 0)); + InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort()); + socket.connect(endpointSocketAddr, 3000); + logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p); + isReachable = true; + } catch (IOException e) { + logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p); + } finally { + if (socket != null) { + try { + socket.close(); + } catch (IOException e) { + logger.warn("Error occurred while closing socket of validating proxy", e); + } + } + } + return isReachable; + } + + private static String getNetworkInterface() { + String networkInterfaceName = ""; + Enumeration enumeration = null; + try { + enumeration = NetworkInterface.getNetworkInterfaces(); + } catch (SocketException e1) { + e1.printStackTrace(); + } + while (enumeration.hasMoreElements()) { + NetworkInterface networkInterface = enumeration.nextElement(); + networkInterfaceName += networkInterface.toString() + '\n'; + Enumeration addr = networkInterface.getInetAddresses(); + while (addr.hasMoreElements()) { + networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n"; + } + } + return networkInterfaceName; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java new file mode 100644 index 0000000..e2bb552 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -0,0 +1,109 @@ +package us.codecraft.webmagic.selector; + +import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + * @since 0.5.2 + */ +public abstract class AbstractSelectable implements Selectable { + + protected abstract List getSourceTexts(); + + @Override + public Selectable css(String selector) { + return $(selector); + } + + @Override + public Selectable css(String selector, String attrName) { + return $(selector, attrName); + } + + protected Selectable select(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + String result = selector.select(string); + if (result != null) { + results.add(result); + } + } + return new PlainText(results); + } + + protected Selectable selectList(Selector selector, List strings) { + List results = new ArrayList(); + for (String string : strings) { + List result = selector.selectList(string); + results.addAll(result); + } + return new PlainText(results); + } + + @Override + public List all() { + return getSourceTexts(); + } + + @Override + public Selectable jsonPath(String jsonPath) { + throw new UnsupportedOperationException(); + } + + @Override + public String get() { + if (CollectionUtils.isNotEmpty(all())) { + return all().get(0); + } else { + return null; + } + } + + @Override + public Selectable select(Selector selector) { + return select(selector, getSourceTexts()); + } + + @Override + public Selectable selectList(Selector selector) { + return selectList(selector, getSourceTexts()); + } + + @Override + public Selectable regex(String regex) { + RegexSelector regexSelector = Selectors.regex(regex); + return selectList(regexSelector, getSourceTexts()); + } + + @Override + public Selectable regex(String regex, int group) { + RegexSelector regexSelector = Selectors.regex(regex, group); + return selectList(regexSelector, getSourceTexts()); + } + + @Override + public Selectable replace(String regex, String replacement) { + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); + return select(replaceSelector, getSourceTexts()); + } + + public String getFirstSourceText() { + if (getSourceTexts() != null && getSourceTexts().size() > 0) { + return getSourceTexts().get(0); + } + return null; + } + + @Override + public String toString() { + return get(); + } + + @Override + public boolean match() { + return getSourceTexts() != null && getSourceTexts().size() > 0; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index 7d9035f..bbc7217 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; +import org.jsoup.nodes.Element; import java.util.ArrayList; import java.util.List; @@ -28,4 +29,25 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { } } + public Element selectElement(String text) { + if (text != null) { + return selectElement(Jsoup.parse(text)); + } + return null; + } + + public List selectElements(String text) { + if (text != null) { + return selectElements(Jsoup.parse(text)); + } else { + return new ArrayList(); + } + } + + public abstract Element selectElement(Element element); + + public abstract List selectElements(Element element); + + public abstract boolean hasAttribute(); + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 185db74..6a638db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector { @Override public String select(Element element) { - Elements elements = element.select(selectorText); + List elements = selectElements(element); if (CollectionUtils.isEmpty(elements)) { return null; } @@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector { @Override public List selectList(Element doc) { List strings = new ArrayList(); - Elements elements = doc.select(selectorText); + List elements = selectElements(doc); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { String value = getValue(element); @@ -78,4 +78,23 @@ public class CssSelector extends BaseElementSelector { } return strings; } + + @Override + public Element selectElement(Element element) { + Elements elements = element.select(selectorText); + if (CollectionUtils.isNotEmpty(elements)) { + return elements.get(0); + } + return null; + } + + @Override + public List selectElements(Element element) { + return element.select(selectorText); + } + + @Override + public boolean hasAttribute() { + return attrName != null; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 34386b5..7b593ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -14,7 +15,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class Html extends PlainText { +public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -23,123 +24,26 @@ public class Html extends PlainText { */ private Document document; - private boolean needInitCache = true; - - public Html(List strings) { - super(strings); - } - public Html(String text) { - super(text); - } - - public Html(List strings, boolean needInitCache) { - super(strings); - this.needInitCache = needInitCache; - } - - public Html(String text, boolean needInitCache) { - super(text); - this.needInitCache = needInitCache; - } - - /** - * lazy init - */ - private void initDocument() { - if (this.document == null && needInitCache) { - needInitCache = false; - //just init once whether the parsing succeeds or not - try { - this.document = Jsoup.parse(getText()); - } catch (Exception e) { - logger.warn("parse document error ", e); - } + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + this.document = null; + logger.warn("parse document error ", e); } } public Html(Document document) { - super(document.html()); this.document = document; } - public static Html create(String text) { - return new Html(text); - } - - @Override - protected Selectable select(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - String result = selector.select(string); - if (result != null) { - results.add(result); - } - } - return new Html(results, false); - } - - @Override - protected Selectable selectList(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - List result = selector.selectList(string); - results.addAll(result); - } - return new Html(results, false); - } - - @Override - public Selectable smartContent() { - initDocument(); - SmartContentSelector smartContentSelector = Selectors.smartContent(); - return select(smartContentSelector, strings); - } - - @Override - public Selectable links() { - return xpath("//a/@href"); - } - - @Override - public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); - if (document != null) { - return new Html(xpathSelector.selectList(document), false); - } - return selectList(xpathSelector, strings); - } - - @Override - public Selectable $(String selector) { - CssSelector cssSelector = Selectors.$(selector); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - - @Override - public Selectable $(String selector, String attrName) { - CssSelector cssSelector = Selectors.$(selector, attrName); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - public Document getDocument() { - initDocument(); return document; } - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return document.html(); + @Override + protected List getElements() { + return Collections.singletonList(getDocument()); } /** @@ -151,7 +55,7 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); } else { - return selector.select(getText()); + return selector.select(getFirstSourceText()); } } @@ -160,7 +64,12 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); } else { - return selector.selectList(getText()); + return selector.selectList(getFirstSourceText()); } } + + public static Html create(String text) { + return new Html(text); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java new file mode 100644 index 0000000..e41267b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -0,0 +1,125 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; + +/** + * @author code4crafer@gmail.com + */ +public class HtmlNode extends AbstractSelectable { + + private final List elements; + + public HtmlNode(List elements) { + this.elements = elements; + } + + public HtmlNode() { + elements = null; + } + + protected List getElements() { + return elements; + } + + @Override + public Selectable smartContent() { + SmartContentSelector smartContentSelector = Selectors.smartContent(); + return select(smartContentSelector, getSourceTexts()); + } + + @Override + public Selectable links() { + return xpath("//a/@href"); + } + + @Override + public Selectable xpath(String xpath) { + XpathSelector xpathSelector = Selectors.xpath(xpath); + return selectElements(xpathSelector); + } + + /** + * select elements + * + * @param elementSelector + * @return + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator + * @param element + */ + private Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } + + @Override + public Selectable $(String selector) { + CssSelector cssSelector = Selectors.$(selector); + return selectElements(cssSelector); + } + + @Override + public Selectable $(String selector, String attrName) { + CssSelector cssSelector = Selectors.$(selector, attrName); + return selectElements(cssSelector); + } + + @Override + public List nodes() { + List selectables = new ArrayList(); + for (Element element : getElements()) { + List childElements = new ArrayList(1); + childElements.add(element); + selectables.add(new HtmlNode(childElements)); + } + return selectables; + } + + @Override + protected List getSourceTexts() { + List sourceTexts = new ArrayList(getElements().size()); + for (Element element : getElements()) { + sourceTexts.add(element.toString()); + } + return sourceTexts; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java index ef45d00..4c31eb4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.selector; import com.alibaba.fastjson.JSON; -import org.jsoup.parser.TokenQueue; +import us.codecraft.xsoup.XTokenQueue; import java.util.List; @@ -26,39 +26,32 @@ public class Json extends PlainText { * @return */ public Json removePadding(String padding) { - String text = getText(); - TokenQueue tokenQueue = new TokenQueue(text); + String text = getFirstSourceText(); + XTokenQueue tokenQueue = new XTokenQueue(text); tokenQueue.consumeWhitespace(); tokenQueue.consume(padding); tokenQueue.consumeWhitespace(); - String chompBalanced = tokenQueue.chompBalanced('(', ')'); + String chompBalanced = tokenQueue.chompBalancedNotInQuotes('(', ')'); return new Json(chompBalanced); } public T toObject(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseObject(getText(), clazz); + return JSON.parseObject(getFirstSourceText(), clazz); } public List toList(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseArray(getText(), clazz); - } - - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return null; + return JSON.parseArray(getFirstSourceText(), clazz); } @Override public Selectable jsonPath(String jsonPath) { JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); - return selectList(jsonPathSelector,strings); + return selectList(jsonPathSelector,getSourceTexts()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f9083a8..b0b90f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -46,9 +46,12 @@ public class JsonPathSelector implements Selector { return list; } if (object instanceof List) { - return (List) object; + List items = (List) object; + for (Object item : items) { + list.add(String.valueOf(item)); + } } else { - list.add(object.toString()); + list.add(String.valueOf(object)); } return list; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index efa38d8..557763b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -1,7 +1,5 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; - import java.util.ArrayList; import java.util.List; @@ -12,18 +10,17 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class PlainText implements Selectable { +public class PlainText extends AbstractSelectable { - protected List strings; + protected List sourceTexts; - public PlainText(List strings) { - this.strings = strings; + public PlainText(List sourceTexts) { + this.sourceTexts = sourceTexts; } public PlainText(String text) { - List results = new ArrayList(); - results.add(text); - this.strings = results; + this.sourceTexts = new ArrayList(); + sourceTexts.add(text); } public static PlainText create(String text) { @@ -45,16 +42,6 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } - @Override - public Selectable css(String selector) { - return $(selector); - } - - @Override - public Selectable css(String selector, String attrName) { - return $(selector, attrName); - } - @Override public Selectable smartContent() { throw new UnsupportedOperationException(); @@ -66,79 +53,16 @@ public class PlainText implements Selectable { } @Override - public Selectable regex(String regex) { - RegexSelector regexSelector = Selectors.regex(regex); - return selectList(regexSelector, strings); - } - - @Override - public Selectable regex(String regex, int group) { - RegexSelector regexSelector = Selectors.regex(regex, group); - return selectList(regexSelector, strings); - } - - protected Selectable select(Selector selector, List strings) { - List results = new ArrayList(); - for (String string : strings) { - String result = selector.select(string); - if (result != null) { - results.add(result); - } + public List nodes() { + List nodes = new ArrayList(getSourceTexts().size()); + for (String string : getSourceTexts()) { + nodes.add(PlainText.create(string)); } - return new PlainText(results); - } - - protected Selectable selectList(Selector selector, List strings) { - List results = new ArrayList(); - for (String string : strings) { - List result = selector.selectList(string); - results.addAll(result); - } - return new PlainText(results); + return nodes; } @Override - public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); - return select(replaceSelector, strings); - } - - @Override - public List all() { - return strings; - } - - @Override - public Selectable jsonPath(String jsonPath) { - throw new UnsupportedOperationException(); - } - - @Override - public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } - } - - @Override - public Selectable select(Selector selector) { - return select(selector, strings); - } - - @Override - public Selectable selectList(Selector selector) { - return selectList(selector, strings); - } - - @Override - public String toString() { - return get(); - } - - @Override - public boolean match() { - return strings != null && strings.size() > 0; + protected List getSourceTexts() { + return sourceTexts; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 2cc4ed9..341a077 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -143,4 +143,10 @@ public interface Selectable { * @return */ public Selectable selectList(Selector selector); + + /** + * get all nodes + * @return + */ + public List nodes(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index d1bbcae..8a980a5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selector; +import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -29,4 +30,23 @@ public class XpathSelector extends BaseElementSelector { public List selectList(Element element) { return xPathEvaluator.evaluate(element).list(); } + + @Override + public Element selectElement(Element element) { + List elements = selectElements(element); + if (CollectionUtils.isNotEmpty(elements)){ + return elements.get(0); + } + return null; + } + + @Override + public List selectElements(Element element) { + return xPathEvaluator.evaluate(element).getElements(); + } + + @Override + public boolean hasAttribute() { + return xPathEvaluator.hasAttribute(); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ab84665..352e49c 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,5 +1,10 @@ package us.codecraft.webmagic.downloader; +import com.github.dreamhead.moco.*; +import com.github.dreamhead.moco.Runnable; +import org.apache.commons.io.IOUtils; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.impl.client.CloseableHttpClient; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; @@ -8,9 +13,12 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import java.io.IOException; import java.io.UnsupportedEncodingException; +import static com.github.dreamhead.moco.Moco.*; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; /** @@ -31,7 +39,7 @@ public class HttpClientDownloaderTest { public void testDownloader() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Html html = httpClientDownloader.download("https://github.com"); - assertTrue(!html.getText().isEmpty()); + assertTrue(!html.getFirstSourceText().isEmpty()); } @Test(expected = IllegalArgumentException.class) @@ -52,4 +60,54 @@ public class HttpClientDownloaderTest { assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2); } + @Test + public void testGetHtmlCharset() throws Exception { + HttpServer server = httpserver(12306); + server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk")); + server.get(by(uri("/meta4"))).response(with(text("\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "")),header("Content-Type","")); + server.get(by(uri("/meta5"))).response(with(text("\n" + + " \n" + + " \n" + + " \n" + + " \n" + + "")),header("Content-Type","")); + Runner.running(server, new Runnable() { + @Override + public void run() { + String charset = getCharsetByUrl("http://127.0.0.1:12306/header"); + assertEquals(charset, "gbk"); + charset = getCharsetByUrl("http://127.0.0.1:12306/meta4"); + assertEquals(charset, "gbk"); + charset = getCharsetByUrl("http://127.0.0.1:12306/meta5"); + assertEquals(charset, "gbk"); + } + + private String getCharsetByUrl(String url) { + HttpClientDownloader downloader = new HttpClientDownloader(); + Site site = Site.me(); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); + // encoding in http header Content-Type + Request requestGBK = new Request(url); + CloseableHttpResponse httpResponse = null; + try { + httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); + } catch (IOException e) { + e.printStackTrace(); + } + String charset = null; + try { + byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); + charset = downloader.getHtmlCharset(httpResponse,contentBytes); + } catch (IOException e) { + e.printStackTrace(); + } + return charset; + } + }); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java new file mode 100644 index 0000000..e420588 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.pipeline; + +import org.junit.BeforeClass; +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import java.util.UUID; + +/** + * Created by ywooer on 2014/5/6 0006. + */ +public class FilePipelineTest { + + private static ResultItems resultItems; + private static Task task; + + @BeforeClass + public static void before() { + resultItems = new ResultItems(); + resultItems.put("content", "webmagic 爬虫工具"); + Request request = new Request("http://www.baidu.com"); + resultItems.setRequest(request); + + task = new Task() { + @Override + public String getUUID() { + return UUID.randomUUID().toString(); + } + + @Override + public Site getSite() { + return null; + } + }; + } + @Test + public void testProcess() { + FilePipeline filePipeline = new FilePipeline(); + filePipeline.process(resultItems, task); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java index e8da48d..bf9475d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java @@ -1,8 +1,8 @@ package us.codecraft.webmagic.selector; -import junit.framework.Assert; import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; import static us.codecraft.webmagic.selector.Selectors.*; /** @@ -16,19 +16,19 @@ public class ExtractorsTest { @Test public void testEach() { - Assert.assertEquals("aabbcc", $("div h1 a").select(html)); - Assert.assertEquals("xxx", $("div h1 a", "href").select(html)); - Assert.assertEquals("aabbcc", $("div h1 a", "innerHtml").select(html)); - Assert.assertEquals("xxx", xpath("//a/@href").select(html)); - Assert.assertEquals("xxx", regex("a href=\"(.*)\"").select(html)); - Assert.assertEquals("xxx", regex("(a href)=\"(.*)\"", 2).select(html)); + assertThat($("div h1 a").select(html)).isEqualTo("aabbcc"); + assertThat($("div h1 a", "href").select(html)).isEqualTo("xxx"); + assertThat($("div h1 a", "innerHtml").select(html)).isEqualTo("aabbcc"); + assertThat(xpath("//a/@href").select(html)).isEqualTo("xxx"); + assertThat(regex("a href=\"(.*)\"").select(html)).isEqualTo("xxx"); + assertThat(regex("(a href)=\"(.*)\"", 2).select(html)).isEqualTo("xxx"); } @Test public void testCombo() { - Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2)); + assertThat(and($("title"), regex("aa(bb)cc")).select(html2)).isEqualTo("bb"); OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title")); - Assert.assertEquals("aabbcc", or.select(html)); - Assert.assertEquals("aabbcc", or.select(html2)); + assertThat(or.select(html)).isEqualTo("aabbcc"); + assertThat(or.select(html2)).isEqualTo("aabbcc"); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java index c38efe9..9c705b2 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java @@ -1,10 +1,11 @@ package us.codecraft.webmagic.selector; -import junit.framework.Assert; import org.junit.Test; import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; + /** * @author code4crafter@gmai.com
*/ @@ -32,16 +33,16 @@ public class JsonPathSelectorTest { "}"; @Test - public void test() { + public void testJsonPath() { JsonPathSelector jsonPathSelector = new JsonPathSelector("$.store.book[*].author"); String select = jsonPathSelector.select(text); List list = jsonPathSelector.selectList(text); - Assert.assertNotNull(select); - Assert.assertNotNull(list); + assertThat(select).isEqualTo("Nigel Rees"); + assertThat(list).contains("Nigel Rees","Evelyn Waugh"); jsonPathSelector = new JsonPathSelector("$.store.book[?(@.category == 'reference')]"); list = jsonPathSelector.selectList(text); select = jsonPathSelector.select(text); - Assert.assertNotNull(list); - Assert.assertNotNull(select); + assertThat(select).isEqualTo("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}"); + assertThat(list).contains("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}"); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java index 89afbb6..6ad2f87 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -12,9 +12,17 @@ public class JsonTest { private String text = "callback({\"name\":\"json\"})"; + private String textWithBrackerInContent = "callback({\"name\":\"json)\"})"; + @Test public void testRemovePadding() throws Exception { String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json"); } + + @Test + public void testRemovePaddingForQuotes() throws Exception { + String name = new Json(textWithBrackerInContent).removePadding("callback").jsonPath("$.name").get(); + assertThat(name).isEqualTo("json)"); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index a0b8caf..63e8e43 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.selector; -import org.junit.Assert; +import org.assertj.core.api.Assertions; import org.junit.Test; /** @@ -20,6 +20,6 @@ public class RegexSelectorTest { String source = "(hello world"; RegexSelector regexSelector = new RegexSelector(regex); String select = regexSelector.select(source); - Assert.assertEquals(source,select); + Assertions.assertThat(select).isEqualTo(source); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java index 249a837..4ec692d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java @@ -23,4 +23,11 @@ public class SelectorTest { assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall); assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall); } + + @Test + public void testNodes() throws Exception { + Html selectable = new Html(html); + List links = selectable.xpath("//a").nodes(); + assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa"); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 565fde4..86b9db3 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -49,11 +49,6 @@ public class UrlUtilsTest { assertThat(replacedHtml).isEqualTo(""); } - @Test - public void test(){ - UrlUtils.canonicalizeUrl("start tag", "http://www.dianping.com/"); - } - @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/"; diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b5ac4a6..eea8fd2 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.5.1 + 0.5.2 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index 427cdf7..738d4a7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -78,4 +78,17 @@ public class GithubRepo implements HasKey { public int getFork() { return fork; } + + @Override + public String toString() { + return "GithubRepo{" + + "name='" + name + '\'' + + ", author='" + author + '\'' + + ", readme='" + readme + '\'' + + ", language=" + language + + ", star=" + star + + ", fork=" + fork + + ", url='" + url + '\'' + + '}'; + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java new file mode 100644 index 0000000..d8bf9fb --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.PageMapper; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +public class GithubRepoPageMapper implements PageProcessor { + + private Site site = Site.me().setRetryTimes(3).setSleepTime(0); + + private PageMapper githubRepoPageMapper = new PageMapper(GithubRepo.class); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); + GithubRepo githubRepo = githubRepoPageMapper.get(page); + if (githubRepo == null) { + page.setSkip(true); + } else { + page.putField("repo", githubRepo); + } + + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run(); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java new file mode 100644 index 0000000..1cc5ac3 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.model; + +import us.codecraft.webmagic.Page; + +import java.util.List; + +/** + * @author code4crafer@gmail.com + * @since 0.5.2 + */ +public class PageMapper { + + private Class clazz; + + private PageModelExtractor pageModelExtractor; + + public PageMapper(Class clazz) { + this.clazz = clazz; + this.pageModelExtractor = PageModelExtractor.create(clazz); + } + + public T get(Page page) { + return (T) pageModelExtractor.process(page); + } + + public List getAll(Page page) { + return (List) pageModelExtractor.process(page); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index b6c55af..e36f920 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -8,7 +8,6 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; -import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; @@ -38,7 +37,7 @@ public class JsonFilePipeline extends FilePersistentBase implements Pipeline { public void process(ResultItems resultItems, Task task) { String path = this.path + "/" + task.getUUID() + "/"; try { - PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"))); + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"))); printWriter.write(JSON.toJSONString(resultItems.getAll())); printWriter.close(); } catch (IOException e) { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index b8dc898..eeea865 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1 + 0.5.2 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java new file mode 100644 index 0000000..22ae5eb --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java @@ -0,0 +1,50 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.samples.pipeline.OneFilePipeline; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; +import us.codecraft.webmagic.selector.Selectable; + +import java.io.FileNotFoundException; +import java.io.UnsupportedEncodingException; +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class MamacnPageProcessor implements PageProcessor { + + private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100); + + @Override + public void process(Page page) { + List nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes(); + StringBuilder accum = new StringBuilder(); + for (Selectable node : nodes) { + accum.append("img:").append(node.xpath("//a/@href").get()).append("\n"); + accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n"); + } + page.putField("",accum.toString()); + if (accum.length() == 0) { + page.setSkip(true); + } + page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all()); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException { + Spider.create(new MamacnPageProcessor()) + .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn")) + .addUrl("http://www.mama.cn/photo/t1-p1.html") + .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data")) + .thread(5) + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java new file mode 100644 index 0000000..9cb1bc2 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java @@ -0,0 +1,50 @@ +package us.codecraft.webmagic.samples.pipeline; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.utils.FilePersistentBase; + +import java.io.*; +import java.util.Map; + +/** + * @author code4crafer@gmail.com + */ +public class OneFilePipeline extends FilePersistentBase implements Pipeline { + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private PrintWriter printWriter; + + /** + * create a FilePipeline with default path"/data/webmagic/" + */ + public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException { + this("/data/webmagic/"); + } + + public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException { + setPath(path); + printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8")); + } + + @Override + public synchronized void process(ResultItems resultItems, Task task) { + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + for (Map.Entry entry : resultItems.getAll().entrySet()) { + if (entry.getValue() instanceof Iterable) { + Iterable value = (Iterable) entry.getValue(); + printWriter.println(entry.getKey() + ":"); + for (Object o : value) { + printWriter.println(o); + } + } else { + printWriter.println(entry.getKey() + ":\t" + entry.getValue()); + } + } + printWriter.flush(); + } +} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 5424536..5ec2336 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1 + 0.5.2 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 9337dde..74afe65 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1 + 0.5.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 63e5a72..b5bc49e 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.5.1 + 0.5.2 4.0.0 diff --git a/zh_docs/README.md b/zh_docs/README.md index b23bf83..deb17d4 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.5.1 + 0.5.2 us.codecraft webmagic-extension - 0.5.1 + 0.5.2 ```