#424 remove guava dep and add fix docs
parent
c2531c6817
commit
00dfebbceb
10
pom.xml
10
pom.xml
|
@ -70,16 +70,16 @@
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
<version>4.5.2</version>
|
<version>4.5.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
|
||||||
<artifactId>json-path</artifactId>
|
|
||||||
<version>0.8.1</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
<version>15.0</version>
|
<version>15.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
|
<artifactId>json-path</artifactId>
|
||||||
|
<version>0.8.1</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
<artifactId>slf4j-api</artifactId>
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
|
|
@ -20,11 +20,6 @@
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.guava</groupId>
|
|
||||||
<artifactId>guava</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
@ -73,12 +68,6 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
<exclusions>
|
|
||||||
<exclusion>
|
|
||||||
<groupId>commons-lang</groupId>
|
|
||||||
<artifactId>commons-lang</artifactId>
|
|
||||||
</exclusion>
|
|
||||||
</exclusions>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -1,13 +1,10 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import com.google.common.collect.HashBasedTable;
|
|
||||||
import com.google.common.collect.Table;
|
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
|
|
||||||
import us.codecraft.webmagic.proxy.Proxy;
|
|
||||||
import us.codecraft.webmagic.proxy.SimpleProxyPool;
|
|
||||||
import org.apache.http.auth.UsernamePasswordCredentials;
|
import org.apache.http.auth.UsernamePasswordCredentials;
|
||||||
|
import us.codecraft.webmagic.proxy.Proxy;
|
||||||
import us.codecraft.webmagic.proxy.ProxyPool;
|
import us.codecraft.webmagic.proxy.ProxyPool;
|
||||||
|
import us.codecraft.webmagic.proxy.SimpleProxyPool;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -27,7 +24,7 @@ public class Site {
|
||||||
|
|
||||||
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
|
private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
private Table<String, String, String> cookies = HashBasedTable.create();
|
private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();
|
||||||
|
|
||||||
private String charset;
|
private String charset;
|
||||||
|
|
||||||
|
@ -104,7 +101,10 @@ public class Site {
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site addCookie(String domain, String name, String value) {
|
public Site addCookie(String domain, String name, String value) {
|
||||||
cookies.put(domain, name, value);
|
if (!cookies.containsKey(domain)){
|
||||||
|
cookies.put(domain,new HashMap<String, String>());
|
||||||
|
}
|
||||||
|
cookies.get(domain).put(name, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -134,7 +134,7 @@ public class Site {
|
||||||
* @return get cookies
|
* @return get cookies
|
||||||
*/
|
*/
|
||||||
public Map<String,Map<String, String>> getAllCookies() {
|
public Map<String,Map<String, String>> getAllCookies() {
|
||||||
return cookies.rowMap();
|
return cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -483,6 +483,7 @@ public class Site {
|
||||||
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
* Set httpProxyPool, String[0]:ip, String[1]:port <br>
|
||||||
*
|
*
|
||||||
* @param httpProxyList httpProxyList
|
* @param httpProxyList httpProxyList
|
||||||
|
* @param isUseLastProxy isUseLastProxy
|
||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
package us.codecraft.webmagic;
|
package us.codecraft.webmagic;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.apache.http.HttpHost;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import us.codecraft.webmagic.downloader.Downloader;
|
import us.codecraft.webmagic.downloader.Downloader;
|
||||||
|
@ -16,6 +14,7 @@ import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
import us.codecraft.webmagic.scheduler.Scheduler;
|
import us.codecraft.webmagic.scheduler.Scheduler;
|
||||||
import us.codecraft.webmagic.thread.CountableThreadPool;
|
import us.codecraft.webmagic.thread.CountableThreadPool;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -173,9 +172,9 @@ public class Spider implements Runnable, Task {
|
||||||
*
|
*
|
||||||
* @param scheduler scheduler
|
* @param scheduler scheduler
|
||||||
* @return this
|
* @return this
|
||||||
* @Deprecated
|
|
||||||
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
|
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public Spider scheduler(Scheduler scheduler) {
|
public Spider scheduler(Scheduler scheduler) {
|
||||||
return setScheduler(scheduler);
|
return setScheduler(scheduler);
|
||||||
}
|
}
|
||||||
|
@ -499,7 +498,7 @@ public class Spider implements Runnable, Task {
|
||||||
}
|
}
|
||||||
|
|
||||||
public <T> T get(String url) {
|
public <T> T get(String url) {
|
||||||
List<String> urls = Lists.newArrayList(url);
|
List<String> urls = WMCollections.newArrayList(url);
|
||||||
List<T> resultItemses = getAll(urls);
|
List<T> resultItemses = getAll(urls);
|
||||||
if (resultItemses != null && resultItemses.size() > 0) {
|
if (resultItemses != null && resultItemses.size() > 0) {
|
||||||
return resultItemses.get(0);
|
return resultItemses.get(0);
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package us.codecraft.webmagic.downloader;
|
package us.codecraft.webmagic.downloader;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.http.HttpHost;
|
import org.apache.http.HttpHost;
|
||||||
|
@ -28,6 +27,7 @@ import us.codecraft.webmagic.proxy.Proxy;
|
||||||
import us.codecraft.webmagic.selector.PlainText;
|
import us.codecraft.webmagic.selector.PlainText;
|
||||||
import us.codecraft.webmagic.utils.HttpConstant;
|
import us.codecraft.webmagic.utils.HttpConstant;
|
||||||
import us.codecraft.webmagic.utils.UrlUtils;
|
import us.codecraft.webmagic.utils.UrlUtils;
|
||||||
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
||||||
charset = site.getCharset();
|
charset = site.getCharset();
|
||||||
headers = site.getHeaders();
|
headers = site.getHeaders();
|
||||||
} else {
|
} else {
|
||||||
acceptStatCode = Sets.newHashSet(200);
|
acceptStatCode = WMCollections.newHashSet(200);
|
||||||
}
|
}
|
||||||
logger.info("downloading page {}", request.getUrl());
|
logger.info("downloading page {}", request.getUrl());
|
||||||
CloseableHttpResponse httpResponse = null;
|
CloseableHttpResponse httpResponse = null;
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
package us.codecraft.webmagic.scheduler.component;
|
package us.codecraft.webmagic.scheduler.component;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap;
|
||||||
*/
|
*/
|
||||||
public class HashSetDuplicateRemover implements DuplicateRemover {
|
public class HashSetDuplicateRemover implements DuplicateRemover {
|
||||||
|
|
||||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
private Set<String> urls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isDuplicate(Request request, Task task) {
|
public boolean isDuplicate(Request request, Task task) {
|
||||||
|
|
|
@ -33,11 +33,11 @@ public abstract class Selectors {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @Deprecated
|
|
||||||
* @see #xpath(String)
|
* @see #xpath(String)
|
||||||
* @param expr expr
|
* @param expr expr
|
||||||
* @return new selector
|
* @return new selector
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public static XpathSelector xsoup(String expr) {
|
public static XpathSelector xsoup(String expr) {
|
||||||
return new XpathSelector(expr);
|
return new XpathSelector(expr);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
package us.codecraft.webmagic.utils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
 * Small collection factory helpers, used in place of Guava's
 * {@code Sets.newHashSet(...)} and {@code Lists.newArrayList(...)}.
 *
 * @author code4crafter@gmail.com
 * Date: 16/12/18
 * Time: 10:16 AM
 */
public class WMCollections {

    /**
     * Creates a new mutable {@link HashSet} holding the given elements.
     * Duplicate arguments collapse into a single entry.
     *
     * @param t   the elements to add; may be empty
     * @param <T> the element type
     * @return a new, independent, mutable set containing the elements
     * @throws NullPointerException if the varargs array itself is {@code null}
     */
    public static <T> Set<T> newHashSet(T... t) {
        // HashSet(int) takes a table capacity, not an expected element count.
        // Divide by the default load factor (0.75) so that adding all
        // elements does not trigger a rehash.
        Set<T> set = new HashSet<T>((int) (t.length / 0.75f) + 1);
        for (T element : t) {
            set.add(element);
        }
        return set;
    }

    /**
     * Creates a new mutable {@link ArrayList} holding the given elements in
     * argument order. Duplicates are kept.
     *
     * @param t   the elements to add; may be empty
     * @param <T> the element type
     * @return a new, independent, mutable list containing the elements
     * @throws NullPointerException if the varargs array itself is {@code null}
     */
    public static <T> List<T> newArrayList(T... t) {
        List<T> list = new ArrayList<T>(t.length);
        for (T element : t) {
            list.add(element);
        }
        return list;
    }
}
|
|
@ -15,6 +15,12 @@
|
||||||
<artifactId>jedis</artifactId>
|
<artifactId>jedis</artifactId>
|
||||||
<version>2.9.0</version>
|
<version>2.9.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.google.guava</groupId>
|
||||||
|
<artifactId>guava</artifactId>
|
||||||
|
<version>15.0</version>
|
||||||
|
<optional>true</optional>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>us.codecraft</groupId>
|
<groupId>us.codecraft</groupId>
|
||||||
<artifactId>webmagic-core</artifactId>
|
<artifactId>webmagic-core</artifactId>
|
||||||
|
|
|
@ -2,7 +2,6 @@ package us.codecraft.webmagic.configurable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-5
|
|
||||||
*/
|
*/
|
||||||
public enum ExpressionType {
|
public enum ExpressionType {
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,6 @@ import static us.codecraft.webmagic.selector.Selectors.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-5
|
|
||||||
*/
|
*/
|
||||||
public class ExtractRule {
|
public class ExtractRule {
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
||||||
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
||||||
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
||||||
*
|
*
|
||||||
* @param phantomJsCommand
|
* @param phantomJsCommand phantomJsCommand
|
||||||
*/
|
*/
|
||||||
public PhantomJSDownloader(String phantomJsCommand) {
|
public PhantomJSDownloader(String phantomJsCommand) {
|
||||||
this.initPhantomjsCrawlPath();
|
this.initPhantomjsCrawlPath();
|
||||||
|
|
|
@ -9,7 +9,6 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-5
|
|
||||||
*/
|
*/
|
||||||
public class CompositePageProcessor implements PageProcessor {
|
public class CompositePageProcessor implements PageProcessor {
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,6 @@ import us.codecraft.webmagic.Page;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-5
|
|
||||||
*/
|
*/
|
||||||
public interface SubPageProcessor extends RequestMatcher {
|
public interface SubPageProcessor extends RequestMatcher {
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,7 @@ public class SpiderMonitor {
|
||||||
*
|
*
|
||||||
* @param spiders spiders
|
* @param spiders spiders
|
||||||
* @return this
|
* @return this
|
||||||
|
* @throws JMException
|
||||||
*/
|
*/
|
||||||
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
|
||||||
for (Spider spider : spiders) {
|
for (Spider spider : spiders) {
|
||||||
|
|
|
@ -1,9 +1,16 @@
|
||||||
package us.codecraft.webmagic.scheduler.component;
|
package us.codecraft.webmagic.scheduler;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author code4crafter@gmail.com
|
||||||
|
* Date: 16/12/18
|
||||||
|
* Time: 上午10:23
|
||||||
|
*/
|
||||||
|
|
||||||
import com.google.common.hash.BloomFilter;
|
import com.google.common.hash.BloomFilter;
|
||||||
import com.google.common.hash.Funnels;
|
import com.google.common.hash.Funnels;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
@ -67,4 +74,4 @@ public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||||
public int getTotalRequestsCount(Task task) {
|
public int getTotalRequestsCount(Task task) {
|
||||||
return counter.get();
|
return counter.get();
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -3,7 +3,6 @@ package us.codecraft.webmagic.scheduler;
|
||||||
import org.junit.Ignore;
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import us.codecraft.webmagic.Request;
|
import us.codecraft.webmagic.Request;
|
||||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
|
||||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||||
|
|
|
@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-9
|
|
||||||
*/
|
*/
|
||||||
public class BaiduNews {
|
public class BaiduNews {
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author code4crafter@gmail.com
|
* @author code4crafter@gmail.com
|
||||||
* @date 14-4-11
|
|
||||||
*/
|
*/
|
||||||
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
|
@TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
|
||||||
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true)
|
@ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true)
|
||||||
|
|
|
@ -5,8 +5,8 @@ import us.codecraft.webmagic.Site;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
import us.codecraft.webmagic.monitor.SpiderMonitor;
|
||||||
import us.codecraft.webmagic.processor.PageProcessor;
|
import us.codecraft.webmagic.processor.PageProcessor;
|
||||||
|
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
|
||||||
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
import us.codecraft.webmagic.scheduler.QueueScheduler;
|
||||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
|
||||||
|
|
||||||
import javax.management.JMException;
|
import javax.management.JMException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
|
@ -13,7 +13,7 @@ import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by dolphineor on 2014-11-21.
|
* Created by dolphineor on 2014-11-21.
|
||||||
* <p/>
|
* <p>
|
||||||
* 以淘宝为例, 搜索冬装的相关结果
|
* 以淘宝为例, 搜索冬装的相关结果
|
||||||
*/
|
*/
|
||||||
public class PhantomJSPageProcessor implements PageProcessor {
|
public class PhantomJSPageProcessor implements PageProcessor {
|
||||||
|
|
|
@ -19,9 +19,6 @@ public class OneFilePipeline extends FilePersistentBase implements Pipeline {
|
||||||
|
|
||||||
private PrintWriter printWriter;
|
private PrintWriter printWriter;
|
||||||
|
|
||||||
/**
|
|
||||||
* create a FilePipeline with default path"/data/webmagic/"
|
|
||||||
*/
|
|
||||||
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
|
public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException {
|
||||||
this("/data/webmagic/");
|
this("/data/webmagic/");
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package us.codecraft.webmagic.scripts;
|
package us.codecraft.webmagic.scripts;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import org.apache.commons.cli.*;
|
import org.apache.commons.cli.*;
|
||||||
import org.apache.log4j.Level;
|
import org.apache.log4j.Level;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
@ -8,6 +7,7 @@ import us.codecraft.webmagic.ResultItems;
|
||||||
import us.codecraft.webmagic.Spider;
|
import us.codecraft.webmagic.Spider;
|
||||||
import us.codecraft.webmagic.Task;
|
import us.codecraft.webmagic.Task;
|
||||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||||
|
import us.codecraft.webmagic.utils.WMCollections;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -29,8 +29,8 @@ public class ScriptConsole {
|
||||||
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
|
private static Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
alias.put(Language.JavaScript, Sets.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
alias.put(Language.JavaScript, WMCollections.<String>newHashSet("js", "javascript", "JavaScript", "JS"));
|
||||||
alias.put(Language.JRuby, Sets.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
alias.put(Language.JRuby, WMCollections.<String>newHashSet("ruby", "jruby", "Ruby", "JRuby"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLanguagefromArg(String arg) {
|
public void setLanguagefromArg(String arg) {
|
||||||
|
@ -93,7 +93,7 @@ public class ScriptConsole {
|
||||||
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
|
.language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
|
||||||
pageProcessor.getSite().setSleepTime(params.getSleepTime());
|
pageProcessor.getSite().setSleepTime(params.getSleepTime());
|
||||||
pageProcessor.getSite().setRetryTimes(3);
|
pageProcessor.getSite().setRetryTimes(3);
|
||||||
pageProcessor.getSite().setAcceptStatCode(Sets.<Integer>newHashSet(200, 404,403, 500,502));
|
pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));
|
||||||
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
|
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
|
||||||
spider.clearPipeline().addPipeline(new Pipeline() {
|
spider.clearPipeline().addPipeline(new Pipeline() {
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in New Issue